Consider adding a recipes folder.
It could contain:
- full_report.sh: a complete list of "reasonable" pipelines
- sample.sh: extract samples containing speech for further processing (ubuntu_field_data_process.py below is an overly complex example that also does several other things)
- highvoc.sh: the routine Marvin created to find high voc regions
full_report.sh (fake)
# Sketch of a full report: run every "reasonable" pipeline on data/ inside the
# virtual machine and evaluate each one. Script names are illustrative.
vagrant up

# Every speech activity detector (SAD) combined with every diarization tool.
# The original sketch ended this list with "etc": append the remaining SAD
# tools here.
for sad in ldcSad noisemesSad; do
    for diar in diartk; do
        vagrant ssh -c "${sad}.sh data/"
        vagrant ssh -c "${diar}.sh data/ ${sad}"
        vagrant ssh -c "evalDiar.sh data/ ${diar}_${sad}"
    done
done

# Role assignment tools, evaluated as diarization.
for role in yunitator; do
    vagrant ssh -c "${role}.sh data/"
    vagrant ssh -c "evalDiar.sh data/ ${role}"
done

# Vocalization categorization tools.
for voccat in vcm; do
    vagrant ssh -c "${voccat}.sh data/"
    vagrant ssh -c "evalVocCat.sh data/ ${voccat}"
done

# "vagrant down" is not a Vagrant subcommand; "halt" shuts the machine down.
vagrant halt
ubuntu_field_data_process.py
"""Extract conversational blocks from daylong child recordings for Lig Aikuma.

For every child folder found in MAIN_PATH the script:

1. converts each recording to 16 kHz, 16-bit mono and concatenates the
   converted files into one FULL_<folder>.wav (one daylong recording per
   child);
2. skips the first hour, then cuts a 15-second chunk and skips 15 minutes,
   repeatedly, until the end of the recording;
3. runs the LDC speech activity detector (perform_sad.py) on each chunk;
4. merges vocalizations separated by less than 3 seconds into
   "conversational blocks" and cuts each block into its own wav file;
5. concatenates the blocks, each followed by 2 seconds of silence, into one
   file for Lig Aikuma and writes a lookup table of block onsets/offsets.
"""

import os
import codecs
import shutil
import re
import sys
import shlex
import subprocess
import wave
import contextlib
from datetime import datetime

MAIN_PATH = "/Users/acristia/Documents/2018_campobello/to_process/"
PROCESSED_PATH = "/Users/acristia/Documents/2018_campobello/processed/"
LIGAIKUMA_PATH = '/Users/acristia/Documents/2018_campobello/for_ligaikuma/'
SAD_PATH = '/Users/acristia/Documents/speech_tools/ldc_sad_hmm/'

# All durations are in seconds.
WARMUP = 3600        # skip the first hour
CHUNK_LENGTH = 15    # length of each extracted chunk
SKIP = 900           # skip 15 minutes between chunks
MAX_GAP = 3          # vocalizations closer than this belong to the same block
SILENCE_LENGTH = 2   # silence inserted after each block for Lig Aikuma

# A .lab line describing speech: "<begin> <end> speech".
SPEECH_LINE = re.compile(r"(.+)\s(.+)\sspeech")


def wav_duration(path):
    """Return the duration in seconds of the wav file at *path*."""
    with contextlib.closing(wave.open(path, 'r')) as audio:
        return audio.getnframes() / float(audio.getframerate())


def chunk_label(folder, offset):
    """Return the chunk name for *folder* at *offset* seconds.

    The offset is zero-padded to five digits so that all names have the same
    length (e.g. "chunk_kid_03600").
    """
    return "chunk_" + folder + "_" + str(offset).zfill(5)


def parse_speech_segments(lab_lines):
    """Return the (begin, end) pairs of the speech lines in *lab_lines*.

    Times are kept as the strings found in the file because they are reused
    verbatim in the names of the block files.
    """
    segments = []
    for line in lab_lines:
        found = SPEECH_LINE.match(line)
        if found:
            segments.append((found.group(1), found.group(2)))
    return segments


def merge_close_segments(segments, max_gap=MAX_GAP):
    """Merge consecutive segments separated by less than *max_gap* seconds.

    *segments* is a list of (begin, end) string pairs in chronological order.
    Each returned pair spans one conversational block: it starts with the
    begin of its first vocalization and stops with the end of its last one.
    """
    blocks = []
    for begin, end in segments:
        if blocks and float(begin) - float(blocks[-1][1]) < max_gap:
            blocks[-1] = (blocks[-1][0], end)
        else:
            blocks.append((begin, end))
    return blocks


def process_chunk(full_recording, chunks_path, chunk_name, offset):
    """Cut one chunk, run the SAD on it and export its conversational blocks.

    The chunk gets its own folder inside *chunks_path*; the blocks are cut
    into a "<chunk_name>_blocks" sub-folder and then copied to LIGAIKUMA_PATH.
    """
    chunk_dir = os.path.join(chunks_path, chunk_name)
    os.mkdir(chunk_dir)
    chunk_wav = os.path.join(chunk_dir, chunk_name + ".wav")
    subprocess.call(["sox", full_recording, chunk_wav,
                     "trim", str(offset), str(CHUNK_LENGTH)])

    # perform_sad.py is run from its own folder.
    # NOTE(review): -L is presumably the folder receiving the .lab file, since
    # the .lab is read from the chunk folder below - confirm.
    subprocess.call(["python", "perform_sad.py", "-L", chunk_dir, chunk_wav],
                    cwd=SAD_PATH)

    blocks_dir = os.path.join(chunk_dir, chunk_name + "_blocks")
    os.mkdir(blocks_dir)
    with open(os.path.join(chunk_dir, chunk_name + ".lab"), "r") as lab_file:
        segments = parse_speech_segments(lab_file)

    for begin, end in merge_close_segments(segments):
        block_wav = os.path.join(
            blocks_dir, chunk_name + "_" + begin + "_" + end + ".wav")
        # The second argument of sox "trim" is a length, not an end position,
        # so the block duration is passed rather than the end time.
        duration = float(end) - float(begin)
        subprocess.call(["sox", chunk_wav, block_wav,
                         "trim", begin, str(duration)])

    for block in os.listdir(blocks_dir):
        subprocess.call(["cp", os.path.join(blocks_dir, block),
                         LIGAIKUMA_PATH])


def process_child_folder(folder):
    """Convert, concatenate and chunk the recordings of one child.

    Returns True when the folder was processed and moved to PROCESSED_PATH,
    False when it was skipped because no recording could be converted.
    """
    child_path = os.path.join(MAIN_PATH, folder)
    # Listed before the working folders are created, so that those folders
    # are not treated as recordings.
    recordings = os.listdir(child_path)

    chunks_path = os.path.join(child_path, folder + "_chunks")
    converted_path = os.path.join(child_path, folder + "_converted")
    originals_path = os.path.join(child_path, folder + "_originals")
    for path in (chunks_path, converted_path, originals_path):
        os.mkdir(path)

    for name in recordings:
        source = os.path.join(child_path, name)
        subprocess.call(["sox", source, "-c", "1", "-b", "16", "-r", "16k",
                         os.path.join(converted_path, name)])
        subprocess.call(["mv", source, os.path.join(originals_path, name)])

    # os.listdir returns names in arbitrary order; sort them so that the
    # recordings are concatenated in name order.
    converted = sorted(os.listdir(converted_path))
    if not converted:
        sys.stderr.write(
            "No recording could be converted in {}; skipping.\n".format(
                child_path))
        return False

    # One file holding all the recordings concatenated.
    full_recording = os.path.join(child_path, "FULL_" + folder + ".wav")
    subprocess.call(
        ["sox"]
        + [os.path.join(converted_path, name) for name in converted]
        + [full_recording])
    full_duration = wav_duration(full_recording)

    offset = WARMUP
    while full_duration > offset:
        process_chunk(full_recording, chunks_path,
                      chunk_label(folder, offset), offset)
        offset += CHUNK_LENGTH + SKIP

    subprocess.call(["mv", child_path, PROCESSED_PATH])
    return True


def build_ligaikuma_file(folder):
    """Concatenate the blocks waiting in LIGAIKUMA_PATH into one recording.

    Writes final_results/<folder>.wav, in which every block is followed by
    SILENCE_LENGTH seconds of silence, and a tab-separated lookup file
    (block name, onset, offset) that is moved to final_results/for_form.
    The individual block files are deleted afterwards.
    """
    blocks = sorted(name for name in os.listdir(LIGAIKUMA_PATH)
                    if name.endswith(('.wav', '.WAV')))
    if not blocks:
        sys.stderr.write(
            "No conversational block found for {}; nothing to build.\n".format(
                folder))
        return

    temp_path = os.path.join(LIGAIKUMA_PATH, "temp")
    results_path = os.path.join(LIGAIKUMA_PATH, "final_results")
    form_path = os.path.join(results_path, "for_form")
    for path in (temp_path, results_path, form_path):
        os.makedirs(path, exist_ok=True)

    silence_wav = os.path.join(temp_path, "silence.wav")
    subprocess.call(["sox", "-n", "-r", "16k", "-c", "1", silence_wav,
                     "trim", "0.0", str(float(SILENCE_LENGTH))])

    lookup_path = os.path.join(LIGAIKUMA_PATH, folder + "_lookup.txt")
    sox_command = ["sox"]
    onset = 0
    with codecs.open(lookup_path, "w", "utf-8") as lookup:
        for name in blocks:
            block_wav = os.path.join(LIGAIKUMA_PATH, name)
            offset = onset + wav_duration(block_wav)
            lookup.write(name + "\t" + str(onset) + "\t" + str(offset) + "\n")
            # Every block, including the first one, goes into the audio so
            # that the recording matches the lookup table.
            sox_command += [block_wav, silence_wav]
            onset = offset + SILENCE_LENGTH
    sox_command.append(os.path.join(results_path, folder + ".wav"))
    subprocess.call(sox_command)

    for name in blocks:
        os.remove(os.path.join(LIGAIKUMA_PATH, name))
    shutil.rmtree(temp_path)
    subprocess.call(["mv", lookup_path, form_path])


def main():
    """Process every child folder of MAIN_PATH and print the elapsed time."""
    start_time = datetime.now()
    # One folder = one child.
    for folder in os.listdir(MAIN_PATH):
        # Ignore stray files that are not child folders.
        if not os.path.isdir(os.path.join(MAIN_PATH, folder)):
            continue
        if process_child_folder(folder):
            build_ligaikuma_file(folder)
    print(datetime.now() - start_time)


if __name__ == "__main__":
    main()
Any interest in putting together an example recipe for the CogSci tutorial on July 24?