Commit ea7a9ddb authored by Nikos Athanasiou

first commit a bit of analysis, added shape experiments

parent b47274e3
# AMASS-BABEL analysis

Symlink the cluster data directory into the working directory (this creates a local `data` link):

`ln -s /is/cluster/nathanasiou/data`
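A typical invocation of the labeling script, taken from the RUN EXAMPLE docstring in `add_babel_labels.py` (adjust the paths to your setup):

```bash
python divotion/dataset/add_babel_labels.py \
    --input-path /is/cluster/nathanasiou/data/amass/processed_amass_smplh_wshape_30fps \
    --out-path /is/cluster/nathanasiou/data/babel/babel-smplh30fps-gender \
    --babel-path /is/cluster/nathanasiou/data/babel/babel_v2.1/
```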
\ No newline at end of file
import sys
import argparse
from loguru import logger
import json
import glob
import joblib
from tqdm import tqdm
import os
from utils import read_json, write_json, fix_spell
'''
RUN EXAMPLE
python divotion/dataset/add_babel_labels.py \
--input-path /is/cluster/nathanasiou/data/amass/processed_amass_smplh_wshape_30fps \
--out-path /is/cluster/nathanasiou/data/babel/babel-smplh30fps-gender \
--babel-path /is/cluster/nathanasiou/data/babel/babel_v2.1/
'''
def extract_frame_labels(babel_labels, fps, seqlen):
seg_ids = []
seg_acts = []
if babel_labels['frame_ann'] is None:
# no frame-level annotation: fall back to the first sequence-level (seq_ann) label
action_label = babel_labels['seq_ann']['labels'][0]['proc_label']
seg_ids.append([0, seqlen])
seg_acts.append(fix_spell(action_label))
else:
for seg_an in babel_labels['frame_ann']['labels']:
action_label = fix_spell(seg_an['proc_label'])
st_f = int(seg_an['start_t']*fps)
end_f = int(seg_an['end_t']*fps)
if end_f > seqlen:
end_f = seqlen
seg_ids.append([st_f, end_f])
seg_acts.append(action_label)
return seg_ids, seg_acts
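# Worked illustration (hypothetical values): at fps=30, a frame-level label
# annotated from start_t=1.0 s to end_t=2.5 s maps to frames
# [int(1.0 * 30), int(2.5 * 30)] = [30, 75], so the function returns
# seg_ids = [[30, 75]] and seg_acts = ['walk'] (after fix_spell).
# Segments whose end frame exceeds seqlen are clipped to seqlen.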
def process_data(input_dir, out_dir, amass2babel, babel_data_train,
babel_data_val, babel_data_test):
amass_subsets = glob.glob(f'{input_dir}/*/*')
babel_keys = list(babel_data_train.keys()) + list(babel_data_test.keys()) + list(babel_data_val.keys())
for am_s_path in amass_subsets:
amass_subset = joblib.load(am_s_path)
logger.info(f'Loading the dataset from {am_s_path}')
dataset_db_lists = {'train': [],
'val': [],
'test': []
}
for sample in tqdm(amass_subset):
if sample['fname'] not in amass2babel:
continue
split_of_seq = amass2babel[sample['fname']]['split']
babel_seq_id = amass2babel[sample['fname']]['babel_id']
if split_of_seq == 'train':
babel_data_seq = babel_data_train[babel_seq_id]
elif split_of_seq == 'val':
babel_data_seq = babel_data_val[babel_seq_id]
elif split_of_seq == 'test':
babel_data_seq = babel_data_test[babel_seq_id]
seg_indices, seg_actions = extract_frame_labels(babel_data_seq,
sample['fps'],
sample['poses'].shape[0])
for index, seg in enumerate(seg_indices):
sample_babel = {}
sample_babel['fps'] = sample['fps']
sample_babel['fname'] = sample['fname']
for ams_k in ['poses', 'trans', 'joint_positions', 'markers']:
sample_babel[ams_k] = sample[ams_k][seg[0]:seg[1]]
sample_babel['action'] = seg_actions[index]
dataset_db_lists[split_of_seq].append(sample_babel)
os.makedirs(out_dir, exist_ok=True)
for k, v in dataset_db_lists.items():
joblib.dump(v, f'{out_dir}/{k}.pth.tar')
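# Minimal sketch of reading a dumped split back (assumes the default output
# layout produced by the joblib.dump call above):
#     train_db = joblib.load(f'{out_dir}/train.pth.tar')
#     print(train_db[0]['action'], train_db[0]['poses'].shape)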
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', required=True, type=str,
help='path to the processed AMASS data directory (unzipped AMASS only).')
parser.add_argument('--babel-path', required=True, type=str,
help='path to the BABEL annotation directory (train/val/test json files).')
parser.add_argument('--out-path', required=True, type=str,
help='output directory for the BABEL-labeled splits.')
args = parser.parse_args()
input_dir = args.input_path
babel_dir = args.babel_path
out_dir = args.out_path
logger.info(f'Input arguments: \n {args}')
babel_data_train = read_json(f'{babel_dir}/train.json')
babel_data_val = read_json(f'{babel_dir}/val.json')
babel_data_test = read_json(f'{babel_dir}/test.json')
amass2babel = read_json(f'{babel_dir}/id2fname/amass-path2babel.json')
db = process_data(input_dir, out_dir, amass2babel,
babel_data_train, babel_data_val, babel_data_test)
import argparse
from loguru import logger
import glob
import joblib
from tqdm import tqdm
from plot_utils import make_hist_bokeh
from utils import read_json, write_json, fix_spell
import numpy as np
'''
RUN EXAMPLE (same arguments as add_babel_labels.py; substitute the path of this
shape-analysis script)
python divotion/dataset/add_babel_labels.py \
--input-path /is/cluster/nathanasiou/data/amass/processed_amass_smplh_wshape_30fps \
--out-path /is/cluster/nathanasiou/data/babel/babel-smplh30fps-gender \
--babel-path /is/cluster/nathanasiou/data/babel/babel_v2.1/
'''
def process_data(input_dir, out_dir, amass2babel, babel_data_train,
babel_data_val, babel_data_test):
shapes = []
amass_subsets = glob.glob(f'{input_dir}/*/*')
babel_keys = list(babel_data_train.keys()) + list(babel_data_test.keys()) + list(babel_data_val.keys())
for am_s_path in amass_subsets:
amass_subset = joblib.load(am_s_path)
for sample in tqdm(amass_subset):
shapes.append(sample['betas'][:3])
shapes = np.array(shapes)
for i in range(shapes.shape[1]):
hist, edges = np.histogram(shapes[:, i], bins='auto')
make_hist_bokeh(title=f'Histogram-Shape Distribution(AMASS) comp: {i}',
hist=hist, edges=edges,
xaxis='# Samples AMASS',
yaxis='counts',
save_path=f'{out_dir}/shapes_{i}th_component.html')
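# Quick reminder of the np.histogram output consumed by make_hist_bokeh
# (toy numbers, not dataset values):
#     hist, edges = np.histogram(np.array([0.1, 0.1, 0.2]), bins=2)
#     # hist -> array([2, 1]); edges -> approximately [0.1, 0.15, 0.2]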
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', required=True, type=str,
help='path to the processed AMASS data directory (unzipped AMASS only).')
parser.add_argument('--babel-path', required=True, type=str,
help='path to the BABEL annotation directory (train/val/test json files).')
parser.add_argument('--out-path', required=True, type=str,
help='output directory for the shape histogram plots.')
args = parser.parse_args()
input_dir = args.input_path
babel_dir = args.babel_path
out_dir = args.out_path
logger.info(f'Input arguments: \n {args}')
babel_data_train = read_json(f'{babel_dir}/train.json')
babel_data_val = read_json(f'{babel_dir}/val.json')
babel_data_test = read_json(f'{babel_dir}/test.json')
amass2babel = read_json(f'{babel_dir}/id2fname/amass-path2babel.json')
db = process_data(input_dir, out_dir, amass2babel,
babel_data_train, babel_data_val, babel_data_test)
from loguru import logger
import argparse
from transformers import BertTokenizer, BertModel
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from statistics import mean
import os
import numpy as np
import scipy.special
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show
from gensim.models import KeyedVectors
import sys
import json
from trimesh import tol
sys.path.append('.')
from plot_utils import make_hist_bokeh, save_bokeh_plot, w2vec_test, xy_plot_bokeh
from utils import read_json, write_json
import copy
from spellchecker import SpellChecker
# everything at the first level should contain these keys (val, train sets)
keys_babel_d1 = {'babel_sid', 'url', 'feat_p', 'dur', 'seq_ann', 'frame_ann'}
keys_babel_d2_s = {'anntr_id', 'babel_lid', 'mul_act', 'labels'}
keys_babel_d2_f = {'anntr_id', 'babel_lid', 'mul_act', 'labels'}
# xx = seq_labels[0]['proc_label']
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertModel.from_pretrained("bert-base-uncased")
# inputs = torch.tensor(tokenizer.encode(xx)).unsqueeze(0)
# outputs = model(inputs)
# print(outputs[0].shape)
# print(tokenizer.encode(xx), xx)
# t9: spell-correct each whitespace-separated word of a label phrase
def t9(wds):
spell = SpellChecker()
wds = wds.strip().split()
wds = [spell.correction(w) for w in wds]
return ' '.join(wds)
def ratio_ab(a, b):
if b==0:
return 'Division by zero'
x = a/b * 100
return f'{round(x, 2)} % --> {a} / {b}'
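# Example: ratio_ab(25, 200) -> '12.5 % --> 25 / 200'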
def analyze_data(full, test, val, train, outd):
seg_durations = []
os.makedirs(outd, exist_ok=True)
st1_only = []
total_durs = []
seg_durations_wo_transition_fr = []
seg_durations_wo_transition_seq = []
for _, s in full.items():
seq_dur = s['dur']
if s['frame_ann'] is not None:
frame_labels = s['frame_ann']['labels']
for a in frame_labels:
cur_seg_dur = a['end_t'] - a['start_t']
seg_durations.append(cur_seg_dur)
if a['proc_label'] != 'transition':
seg_durations_wo_transition_fr.append(cur_seg_dur)
else:
st1_only.append(seq_dur)
seg_durations_wo_transition_seq.append(seq_dur)
total_durs.append(seq_dur)
# HISTOGRAMS FOR STATISTICS AND DISTRIBUTIONS
# without transition segments
hist, edges = np.histogram(seg_durations_wo_transition_fr, bins='auto')
f1 = make_hist_bokeh(f'Durations Frame Segments wo Transitions[Full-Set]',
hist, edges, xaxis='durations', yaxis='# segs')
# save_path=f'{outd}/plots/{split_name}_segs_only.html')
hist, edges = np.histogram(seg_durations_wo_transition_seq, bins='auto')
f2 = make_hist_bokeh('Durations seqs with St1 only wo Transitions[Full-Set]',
hist, edges, xaxis='durations', yaxis='# segs')
# save_path=f'{outd}/plots/{split_name}_STAGE1_only.html')
# including transition segments
hist, edges = np.histogram(seg_durations, bins='auto')
f3 = make_hist_bokeh(f'Durations Frame Segments[Full-Set]',
hist, edges, xaxis='durations', yaxis='# segs')
hist, edges = np.histogram(st1_only, bins='auto')
f4 = make_hist_bokeh(f'Durations seqs with St1 only[Full-Set]',
hist, edges, xaxis='durations', yaxis='# segs')
# put all the plots in a grid layout
p1 = gridplot([[f1, f2], [f3, f4]])
save_bokeh_plot(p1, f'{outd}/plots/all_MERGED.html')
# NLP ANALYSIS
# from transformers import BertTokenizer
# from transformers import DistilBertTokenizer, DistilBertModel
# tz_dbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# # model = DistilBertModel.from_pretrained("distilbert-base-uncased")
# tz_bert = BertTokenizer.from_pretrained("bert-base-cased")
# #tz.tokenize('walk in place')
# sents = []
# for k, v in full.items():
# if v['frame_ann'] is not None:
# frame_labels = v['frame_ann']['labels']
# for a in frame_labels:
# sents.append(t9(a['proc_label']))
# else:
# seq_labels = v['seq_ann']['labels']
# for a in seq_labels:
# sents.append(t9(a['proc_label']))
# vocab_bert = []
# vocab_distillbert = []
# from collections import Counter
# for s in sents:
# vocab_bert.extend(tz_bert.tokenize(s))
# vocab_distillbert.extend(tz_dbert.tokenize(s))
# counts_bert = dict(Counter(vocab_bert))
# counts_dbert = dict(Counter(vocab_distillbert))
# freqs_bert = sorted(counts_bert.items(), key=lambda pair: pair[1], reverse=True)
# freqs_dbert = sorted(counts_dbert.items(), key=lambda pair: pair[1], reverse=True)
# vocab_bert = list(set(vocab_bert))
# vocab_distillbert = list(set(vocab_distillbert))
# print(vocab_bert)
# print(vocab_distillbert)
# print(f'===Len of BERT: {len(vocab_bert)}')
# print(f'===Len of DistillBERT: {len(vocab_distillbert)}')
# sweep segment-duration thresholds (in seconds) and record, per threshold, the share
# of long frame segments and how often a long segment dominates its sequence
dur_thresholds = np.linspace(10, 120, 100)
long_ratio = []
seq_doms = []
more_than_one_long = []
actually_long = []
for dur_threshold in dur_thresholds:
no_of_longst2 = 0
no_of_longst1 = 0
tot_st1 = 0
tot_st2 = 0
label_long2 = {}
label_long1 = []
ds2 = []
ds1 = []
long_dict = []
j = 0
sents = []
full_labels = []
for k, v in full.items():
dataset_name = v['feat_p'].strip().split('/')[0]
if v['frame_ann'] is not None:
frame_labels = v['frame_ann']['labels']
first_time = True
for a in frame_labels:
tot_st2 += 1
cur_seg_dur = a['end_t'] - a['start_t']
if cur_seg_dur > dur_threshold:
if first_time:
label_long2[k] = []
first_time = False
no_of_longst2 += 1
label_long2[k].append([a['proc_label'],
round(cur_seg_dur, 2),
v['dur'],
round(round(cur_seg_dur, 2) / v['dur'], 2)
])
sents.append(a['proc_label'])
ds2.append(dataset_name)
if a['proc_label']:
sents = []
full_labels.append(a['proc_label'])
else:
tot_st1 += 1
seq_labels = v['seq_ann']['labels']
if v['dur'] > 15:  # sequence-level (stage-1) labels use a fixed 15 s threshold
no_of_longst1 += 1
label_long1.append((seq_labels[0]['proc_label'], v['url']))
ds1.append(dataset_name)
full_labels.append(seq_labels[0]['proc_label'])
se = 0
more_than_big = 0
for k, v in label_long2.items():
if len(v) > 1:
more_than_big += 1
for subs in v:
if subs[3] > 0.75:
se += 1
long_ratio += [(no_of_longst2 / tot_st2)*100]
seq_doms += [(se / len(label_long2))*100]
more_than_one_long += [(more_than_big / len(label_long2))*100]
f1 = xy_plot_bokeh(dur_thresholds, long_ratio,
xaxis_label='Duration thresholds',
yaxis_label='Long frame segments(%)',
w=600, h=600)
#save_path=f'{out_dir}/long_seqs.html')
f2 = xy_plot_bokeh(dur_thresholds, seq_doms,
xaxis_label='Duration thresholds',
yaxis_label='Frame segments that comprise >0.75 of seq. (%)',
w=600, h=600)
# save_path=f'{out_dir}/seq_doms.html')
f3 = xy_plot_bokeh(dur_thresholds, more_than_one_long,
xaxis_label='Duration thresholds',
yaxis_label='Sequences with >1 long segments(%)',
w=600, h=600)
# save_path=f'{out_dir}/more_than_one.html')
p1 = gridplot([[f1, f2], [f3, None]])
save_bokeh_plot(p1, f'{outd}/plots/stage2_long_stats.html')
print(ratio_ab(no_of_longst2, tot_st2))
# https://babel-renders.s3.eu-central-1.amazonaws.com/
# NOTE: everything below this exit() is unreachable exploratory word-embedding code;
# it also assumes a `word_list` of label phrases that is not built anywhere above.
exit()
word_vpath = '/is/cluster/nathanasiou/data/GoogleNews-vectors-negative300.bin'
kv_model = KeyedVectors.load_word2vec_format(word_vpath, binary=True)
oov = []
word_list = list(set(word_list))
from spellchecker import SpellChecker
spell = SpellChecker()
# spell.correction(single_w)  # leftover single-word test call
import nltk
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")
real_words = []
print('=====Started filtering word embeddings')
for i, w in enumerate(word_list):
words_of_phrase = w.strip().split()
words_of_phrase = [spell.correction(single_w) for single_w in words_of_phrase]
for sw in words_of_phrase:
if sw in cachedStopWords:
continue
if not kv_model.has_index_for(sw):
oov.append(sw)
else:
real_words.append(' '.join(words_of_phrase))
print('Finished filtering word embeddings======')
w2vec_test(real_words, f'{outd}/embeddings.html')
# print(oov)
print(ratio_ab(len(oov), len(word_list)))
def check_dict_structure(bbl_d):
for k, v in bbl_d.items():
assert set(v.keys()) == keys_babel_d1, v.keys()
assert set(v['seq_ann'].keys()) == keys_babel_d2_s
if v['frame_ann'] is not None:
assert set(v['frame_ann'].keys()) == keys_babel_d2_f
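# Minimal sketch of the entry layout these assertions expect (values are
# hypothetical; only the key sets come from keys_babel_d1/keys_babel_d2_* above):
#     {
#         'babel_sid': ..., 'url': ..., 'feat_p': ..., 'dur': ...,
#         'seq_ann': {'anntr_id': ..., 'babel_lid': ..., 'mul_act': ..., 'labels': [...]},
#         'frame_ann': None,  # or a dict with the same four keys as 'seq_ann'
#     }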
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--babel-path', required=True, type=str,
help='path to the BABEL annotation directory (train/val/test and babel-full json files).')
parser.add_argument('--out-path', required=True, type=str,
help='output directory for the analysis plots.')
args = parser.parse_args()
babel_dir = args.babel_path
out_dir = args.out_path
logger.info(f'Input arguments: \n {args}')
babel_data_test = read_json(f'{babel_dir}/test.json')
babel_data_train = read_json(f'{babel_dir}/train.json')
babel_data_val = read_json(f'{babel_dir}/val.json')
babel_data_full = read_json(f'{babel_dir}/babel-full.json')
check_dict_structure(babel_data_train)
check_dict_structure(babel_data_test)
check_dict_structure(babel_data_val)
check_dict_structure(babel_data_full)
db = analyze_data(babel_data_full, babel_data_test,
babel_data_val, babel_data_train, out_dir)
'''
python divotion/dataset/babel_analysis.py
--out-path ~/shared_logs/babel2.1-analysis
--babel-path /is/cluster/nathanasiou/data/babel/babel_v2.1
'''
# with open('/is/cluster/nathanasiou/data/amass/amass_cleanup_seqs/BMLrub.json', 'r') as f:
# bml_skate = json.load(f)
# with open('/is/cluster/nathanasiou/data/amass/amass_cleanup_seqs/MPIHDM05.json', 'r') as f:
# mpi_skate = json.load(f)
# seqs_bml = []
# seqs_mpi = []
# sq_a_mpi = 0
# fr_a_mpi = 0
# sq_a_bml = 0
# fr_a_bml = 0
# full_new = copy.deepcopy(bbl_f)
# print(40*'=')
# i = 0
# from tqdm import tqdm
# lll= {'treadmill_slow':('walk slowly in place', 'walk'),
# 'treadmill_fast':('walk fast in place', 'walk'),
# 'treadmill_norm':('walk in place', 'walk'),
# 'normal_walk': ('walk in place', 'walk'),
# 'treadmill_jog':('jog in place', 'jog'),
# 'normal_jog':('jog in place', 'jog')}
# for k, s in full_new.items():
# p = s['feat_p']
# if p in mpi_skate:
# i+=1
# print(s['url'])
# print(p)
# print(40*'=')
# seqs_mpi.append(s['url'])
# if s['frame_ann'] is not None:
# print(s['frame_ann']['labels'])
# nl = input('Labels for current sequence:')
# nl = nl.split(',')
# new_annot = {}
# new_annot['act_cat'] = [nl[1]]
# new_annot['proc_label'] = nl[0]
# new_annot['raw_label'] = nl[0]
# new_annot['seg_id'] = s['frame_ann']['labels'][0]['seg_id']
# full_new[k]['frame_ann']['labels'] = []
# full_new[k]['frame_ann']['labels'].append(new_annot)
# fr_a_mpi += 1
# print(40*'=')
# else:
# print(s['seq_ann']['labels'])
# nl = input('Labels for current sequence:')
# nl = nl.split(',')
# new_annot = {}
# new_annot['act_cat'] = [nl[1]]
# new_annot['proc_label'] = nl[0]
# new_annot['raw_label'] = nl[0]
# new_annot['seg_id'] = s['seq_ann']['labels'][0]['seg_id']
# full_new[k]['seq_ann']['labels'] = []
# full_new[k]['seq_ann']['labels'].append(new_annot)
# sq_a_mpi += 1
# print(40*'=')
# if p in bml_skate:
# i+=1
# print(s['url'])
# print(p)
# print(40*'=')
# seqs_bml.append(s['url'])
# for fn, annot in lll.items():
# if fn in p:
# nl = annot
# found_it = True
# break
# if s['frame_ann'] is not None:
# print(s['frame_ann']['labels'])
# new_annot = {}
# new_annot['act_cat'] = [nl[1]]
# new_annot['proc_label'] = nl[0]
# new_annot['raw_label'] = nl[0]
# new_annot['seg_id'] = s['frame_ann']['labels'][0]['seg_id']
# full_new[k]['frame_ann']['labels'] = []
# full_new[k]['frame_ann']['labels'].append(new_annot)
# fr_a_bml += 1
# print(40*'=')
# else:
# print(s['seq_ann']['labels'])
# new_annot = {}
# new_annot['act_cat'] = [nl[1]]
# new_annot['proc_label'] = nl[0]
# new_annot['raw_label'] = nl[0]
# new_annot['seg_id'] = s['seq_ann']['labels'][0]['seg_id']
# full_new[k]['seq_ann']['labels'] = []
# full_new[k]['seq_ann']['labels'].append(new_annot)
# print(40*'=')
# sq_a_bml += 1
# write_json(full_new,
# '/is/cluster/nathanasiou/data/babel/babel_v2.0/babel_full_v2.0.json')
# exit()
import smplx
def get_body_model(model_type, gender, batch_size, device='cpu', ext='pkl'):
'''
model_type: smpl, smplh, smplx, etc.; refer to the smplx documentation
gender: male, female or neutral
batch_size: a positive integer
'''
mtype = model_type.upper()
if not isinstance(gender, str):
gender = str(gender.astype(str))
gender = gender.upper()
body_model_path = f'data/smpl_models/{model_type}/{mtype}_{gender}.{ext}'
body_model = smplx.create(body_model_path, model_type=model_type,
gender=gender, ext=ext,
use_pca=False,
num_pca_comps=12,
create_global_orient=True,
create_body_pose=True,
create_betas=True,
create_left_hand_pose=True,
create_right_hand_pose=True,
create_expression=True,
create_jaw_pose=True,
create_leye_pose=True,
create_reye_pose=True,
create_transl=True,
batch_size=batch_size)
if device == 'cuda':
return body_model.cuda()
else:
return body_model
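# Minimal usage sketch (assumes the SMPL-H model files exist under
# data/smpl_models/smplh/):
#     body_model = get_body_model('smplh', 'neutral', batch_size=1)
#     output = body_model()  # forward pass with the created default parameters
#     print(output.vertices.shape)  # (1, 6890, 3) for the SMPL/SMPL-H topology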