DiViMe icon indicating copy to clipboard operation
DiViMe copied to clipboard

consider adding a recipes folder

Open alecristia opened this issue 7 years ago • 1 comment

it could contain:

  • full_report.sh: a complete list of "reasonable" pipelines
  • sample.sh: extract samples with speech for further processing (an overly complex example, which does several other things, is ubuntu_field_data_process.py below)
  • highvoc.sh: the routine Marvin created to find high voc regions

full_report.sh (fake)

# full_report.sh (sketch): run every "reasonable" pipeline on data/ and
# evaluate each one. Tool names are passed to the VM as "<name>.sh".

vagrant up

# Speech activity detection (SAD) followed by diarization.
# "etc" is a placeholder for further SAD tools, not a real tool name.
for sad in ldcSad noisemesSad etc ; do
  # The SAD output does not depend on the diarizer, so compute it once.
  vagrant ssh -c "${sad}.sh data/"
  for diar in diartk ; do
    vagrant ssh -c "${diar}.sh data/ ${sad}"
    vagrant ssh -c "evalDiar.sh data/ ${diar}_${sad}"
  done
done

# Role assignment tools, evaluated like diarization output.
for role in yunitator ; do
  vagrant ssh -c "${role}.sh data/"
  vagrant ssh -c "evalDiar.sh data/ ${role}"
done

# Vocalization categorization tools.
for voccat in vcm ; do
  vagrant ssh -c "${voccat}.sh data/"
  vagrant ssh -c "evalVocCat.sh data/ ${voccat}"
done

# "vagrant halt" shuts the VM down; there is no "vagrant down" subcommand.
vagrant halt

ubuntu_field_data_process.py

import codecs
import contextlib
import os
import re
import shlex
import shutil
import subprocess
import sys
import wave
from datetime import datetime

'''

Concatenate all the wav files in the folder, which corresponds to one daylong recording for one child.
Skip the first hour.
Cut 15-second chunks every 15 minutes.
Have the chunks analysed with DiViMe (to tell if and when there are vocalizations).
If two vocalizations are separated by less than 3 seconds --> conversational block --> new recording that we can put on Lig Aikuma.

'''

# Locations of the field-data setup this script was written for.
MAIN_PATH = "/Users/acristia/Documents/2018_campobello/to_process/"
PROCESSED_PATH = "/Users/acristia/Documents/2018_campobello/processed/"
LIGAIKUMA_PATH = "/Users/acristia/Documents/2018_campobello/for_ligaikuma/"
DIVIME_PATH = "/Users/acristia/Documents/speech_tools/ldc_sad_hmm/"

# All durations below are in seconds.
WARMUP = 3600      # skip the first hour of the daylong recording
CHUNK_LENGTH = 15  # length of each extracted chunk
SKIP = 900         # pause between the end of one chunk and the next one
MAX_GAP = 3        # vocalizations closer than this form one block
SILENCE = 2        # silence inserted after every block in the final file

# A .lab line that marks speech: "<begin> <end> speech".
SPEECH_LINE = re.compile(r"(.+)\s(.+)\sspeech")
WAV_EXTENSIONS = (".wav", ".WAV")


def wav_duration(path):
    """Return the duration of the wav file at *path* in seconds."""
    with contextlib.closing(wave.open(path, "r")) as wav_file:
        return wav_file.getnframes() / float(wav_file.getframerate())


def chunk_offsets(full_duration, warmup=WARMUP, chunk_length=CHUNK_LENGTH,
                  skip=SKIP):
    """Yield the start offset of every chunk to cut from a recording.

    Offsets start at *warmup* and advance by ``chunk_length + skip`` for as
    long as they lie strictly before *full_duration*.
    """
    offset = warmup
    while full_duration > offset:
        yield offset
        offset += chunk_length + skip


def chunk_name(folder, offset):
    """Return the chunk name for *folder* at *offset*.

    The offset is zero-padded to five digits so that names keep the same
    length and sort chronologically.
    """
    return "chunk_" + folder + "_" + str(offset).zfill(5)


def parse_speech_segments(lab_lines):
    """Return the ``(begin, end)`` pairs of the speech lines in *lab_lines*.

    The values are kept as the strings found in the file because they are
    reused verbatim in the names of the block files.
    """
    segments = []
    for line in lab_lines:
        found = SPEECH_LINE.match(line)
        if found:
            segments.append((found.group(1), found.group(2)))
    return segments


def merge_segments(segments, max_gap=MAX_GAP):
    """Merge consecutive segments into conversational blocks.

    A segment that begins less than *max_gap* seconds after the end of the
    current block extends that block; any larger gap (including a gap of
    exactly *max_gap*) starts a new block. Returns ``(begin, end)`` string
    pairs; an empty input gives an empty list.
    """
    blocks = []
    for begin, end in segments:
        if blocks and float(begin) - float(blocks[-1][1]) < max_gap:
            blocks[-1] = (blocks[-1][0], end)
        else:
            blocks.append((begin, end))
    return blocks


def lookup_rows(named_durations, silence=SILENCE):
    """Return ``(name, onset, offset)`` string rows for the lookup table.

    *named_durations* is a sequence of ``(file name, duration)`` pairs in the
    order in which the files are concatenated; each file is followed by
    *silence* seconds, so the next onset is the previous offset plus
    *silence*.
    """
    rows = []
    onset = 0
    for name, duration in named_durations:
        offset = onset + duration
        rows.append((name, str(onset), str(offset)))
        onset = offset + silence
    return rows


def cut_conversational_blocks(chunk_dir, chunk, max_gap=MAX_GAP):
    """Cut the conversational blocks of one analysed chunk.

    Reads ``<chunk>.lab`` from *chunk_dir*, merges the speech segments with
    :func:`merge_segments` and writes one wav per block into a new
    ``<chunk>_blocks`` folder. Returns the path of that folder.
    """
    blocks_dir = os.path.join(chunk_dir, chunk + "_blocks")
    os.mkdir(blocks_dir)
    with open(os.path.join(chunk_dir, chunk + ".lab"), "r") as lab_file:
        segments = parse_speech_segments(lab_file)
    chunk_wav = os.path.join(chunk_dir, chunk + ".wav")
    for begin, end in merge_segments(segments, max_gap):
        block_wav = os.path.join(
            blocks_dir, "{}_{}_{}.wav".format(chunk, begin, end))
        # "=" makes the end position absolute within the chunk; a bare value
        # would be read by sox as a length counted from `begin`.
        subprocess.call(["sox", chunk_wav, block_wav, "trim", begin, "=" + end])
    return blocks_dir


def process_child(folder, main_path=MAIN_PATH, processed_path=PROCESSED_PATH,
                  ligaikuma_path=LIGAIKUMA_PATH, divime_path=DIVIME_PATH):
    """Convert, concatenate, chunk and analyse the recordings of one child.

    Every block that is found is copied to *ligaikuma_path*, and the child's
    folder is finally moved to *processed_path*. Returns ``True`` when the
    folder was processed, ``False`` when it held nothing sox could convert.
    """
    child_dir = os.path.join(main_path, folder)
    # Listed before the work folders are created so they are not included.
    recordings = os.listdir(child_dir)
    chunks_dir = os.path.join(child_dir, folder + "_chunks")
    converted_dir = os.path.join(child_dir, folder + "_converted")
    originals_dir = os.path.join(child_dir, folder + "_originals")
    for work_dir in (chunks_dir, converted_dir, originals_dir):
        os.mkdir(work_dir)

    # Convert everything to mono, 16 bit, 16 kHz and set the originals aside.
    for name in recordings:
        source = os.path.join(child_dir, name)
        subprocess.call(["sox", source, "-c", "1", "-b", "16", "-r", "16k",
                         os.path.join(converted_dir, name)])
        shutil.move(source, os.path.join(originals_dir, name))

    # Sorted so the parts are concatenated in file-name order rather than in
    # the arbitrary order os.listdir returns.
    converted = sorted(os.listdir(converted_dir))
    if not converted:
        print("no recording could be converted in {}".format(child_dir))
        return False
    full_wav = os.path.join(child_dir, "FULL_" + folder + ".wav")
    subprocess.call(["sox"] + converted + [full_wav], cwd=converted_dir)

    for offset in chunk_offsets(wav_duration(full_wav)):
        chunk = chunk_name(folder, offset)
        chunk_dir = os.path.join(chunks_dir, chunk)
        os.mkdir(chunk_dir)
        chunk_wav = os.path.join(chunk_dir, chunk + ".wav")
        subprocess.call(["sox", full_wav, chunk_wav,
                         "trim", str(offset), str(CHUNK_LENGTH)])
        # Speech activity detection; presumably writes <chunk>.lab into the
        # directory given with -L (the next step reads it from there).
        subprocess.call(["python", "perform_sad.py", "-L", chunk_dir, chunk_wav],
                        cwd=divime_path)
        blocks_dir = cut_conversational_blocks(chunk_dir, chunk)
        for block in os.listdir(blocks_dir):
            shutil.copy(os.path.join(blocks_dir, block),
                        os.path.join(ligaikuma_path, block))

    shutil.move(child_dir, os.path.join(processed_path, folder))
    return True


def assemble_for_ligaikuma(folder, ligaikuma_path=LIGAIKUMA_PATH,
                           silence=SILENCE):
    """Join the blocks waiting in *ligaikuma_path* into one recording.

    Writes ``final_results/<folder>.wav`` (every block followed by *silence*
    seconds of silence) and ``final_results/for_form/<folder>_lookup.txt``
    (tab-separated name, onset, offset), then removes the individual blocks.
    Does nothing when no block is waiting.
    """
    blocks = sorted(name for name in os.listdir(ligaikuma_path)
                    if name.endswith(WAV_EXTENSIONS))
    if not blocks:
        print("no conversational block found for {}".format(folder))
        return

    temp_dir = os.path.join(ligaikuma_path, "temp")
    for_form_dir = os.path.join(ligaikuma_path, "final_results", "for_form")
    for work_dir in (temp_dir, for_form_dir):
        os.makedirs(work_dir, exist_ok=True)
    subprocess.call(["sox", "-n", "-r", "16k", "-c", "1", "silence.wav",
                     "trim", "0.0", str(float(silence))], cwd=temp_dir)

    durations = [(name, wav_duration(os.path.join(ligaikuma_path, name)))
                 for name in blocks]
    lookup_path = os.path.join(for_form_dir, folder + "_lookup.txt")
    with codecs.open(lookup_path, "w", "utf-8") as lookup:
        for row in lookup_rows(durations, silence):
            lookup.write("\t".join(row) + "\n")

    # Every block, the first one included, goes into the final file so that
    # the audio matches the onsets and offsets of the lookup table.
    command = ["sox"]
    for name in blocks:
        command += [name, "temp/silence.wav"]
    command.append("final_results/{}.wav".format(folder))
    subprocess.call(command, cwd=ligaikuma_path)

    for name in blocks:
        os.remove(os.path.join(ligaikuma_path, name))
    shutil.rmtree(temp_dir)


def main():
    """Process every child folder of MAIN_PATH and print the elapsed time."""
    start_time = datetime.now()
    for folder in sorted(os.listdir(MAIN_PATH)):
        # One folder = one child; stray files (e.g. .DS_Store) are skipped.
        if not os.path.isdir(os.path.join(MAIN_PATH, folder)):
            continue
        if process_child(folder):
            assemble_for_ligaikuma(folder)
    print(datetime.now() - start_time)


if __name__ == "__main__":
    main()

alecristia avatar Nov 28 '18 23:11 alecristia

any interest in putting together an example recipe for the CogSci tutorial on July 24?

marisacasillas avatar Jul 15 '19 12:07 marisacasillas