问题描述
我想标记并提取音频文件(audio.wav)中的某些片段。各片段的开始时间和持续时间由另一个文件(标注文件 annot.csv)给出:开始时间是 DateTimeStamp(第一列),动作持续时间(以毫秒为单位)是第三列:
DateTimeStamp Action Duration of action in milliseconds
04/16/20 21:25:36:241 A 502
04/16/20 21:25:36:317 B 2253
04/16/20 21:25:36:734 X 118
04/16/20 21:25:36:837 C 10
04/16/20 21:25:37:537 D 797
04/16/20 21:25:37:606 X 70
04/16/20 21:25:37:874 A 1506
. . .
audio.wav 的起始时刻对应于 annot.csv 中第一个 DateTimeStamp。我该如何利用 annot.csv 中的信息来标记并提取 audio.wav 中的某个片段(例如,对应于 Action X 的片段)?
我尝试使用 librosa 和 pyAudioAnalysis 软件包来解决这个问题,但没有找到所需的信息。非常感谢任何帮助。
解决方法
这里的关键是计算每个指定片段的起点和终点(以音频样本索引表示)。
这可以通过先将毫秒转换为秒,再乘以音频的采样率得到样本索引来完成。
不过一般来说,在处理此类时间序列时,我建议使用 Pandas 的 datetime 和 timedelta 功能。下面是实现此目的的示例代码:
import io
import pandas
import numpy
import librosa
def read_data(f, date_format):
    """Load the annotation CSV and compute each segment's offsets.

    Parameters
    ----------
    f : str or file-like
        Passed straight to ``pandas.read_csv``. The data must contain the
        columns ``DateTimeStamp`` and ``Duration ms``.
    date_format : str
        ``strftime``-style format used to parse ``DateTimeStamp``.

    Returns
    -------
    pandas.DataFrame
        The input rows with ``DateTimeStamp`` and ``Duration ms`` replaced by
        ``Time`` (datetime) and ``Duration`` (timedelta), plus ``Start`` and
        ``End`` timedeltas measured from the first row's ``Time``.

    Raises
    ------
    IndexError
        If the file contains no data rows (there is no first timestamp).
    """
    df = pandas.read_csv(f, sep=',')

    # Use proper pandas datatypes
    df['Time'] = pandas.to_datetime(df['DateTimeStamp'], format=date_format)
    df['Duration'] = pandas.to_timedelta(df['Duration ms'], unit='ms')
    df = df.drop(columns=['DateTimeStamp', 'Duration ms'])

    # Compute start and end time of each segment.
    # The audio starts at the time of the first segment, so every offset is
    # taken relative to the first row.
    first = df['Time'].iloc[0]
    df['Start'] = df['Time'] - first
    df['End'] = df['Start'] + df['Duration']

    return df
def extract_segments(y, sr, segments):
    """Slice the audio into the annotated segments and report their sizes.

    Parameters
    ----------
    y : sequence supporting slicing
        The audio samples.
    sr : number
        Sampling rate used to convert seconds into sample indices.
    segments : pandas.DataFrame
        Must provide timedelta columns ``Start`` and ``End``.

    Each segment is sliced out of ``y`` and its length in samples is printed;
    nothing is returned.
    """
    # Compute segment regions in number of samples. The start is rounded down
    # and the end rounded up, so a slice never covers less than the annotated
    # interval.
    starts = numpy.floor(segments.Start.dt.total_seconds() * sr).astype(int)
    ends = numpy.ceil(segments.End.dt.total_seconds() * sr).astype(int)

    # Slice the audio into segments
    for start, end in zip(starts, ends):
        audio_seg = y[start:end]
        print('extracting audio segment:', len(audio_seg), 'samples')
## Reproducible example
# Inline copy of the annotation file. Every row needs all three fields
# (DateTimeStamp, Action, Duration ms); a row without the Action field would
# push the duration into the Action column and leave the duration empty.
data = io.StringIO("""DateTimeStamp,Action,Duration ms
04/16/20 21:25:36:241,A,502
04/16/20 21:25:36:317,B,2253
04/16/20 21:25:36:734,X,118
04/16/20 21:25:36:837,C,10
04/16/20 21:25:37:537,D,797
04/16/20 21:25:37:606,X,70
04/16/20 21:25:37:874,A,1506
""")

segments = read_data(data, date_format="%m/%d/%y %H:%M:%S:%f")
print(segments)

# NOTE(review): newer librosa releases appear to provide librosa.example()
# in place of this helper - confirm against the installed version.
path = librosa.util.example_audio_file()
y, sr = librosa.load(path, sr=16000, duration=10)

# extract_segments takes (y, sr, segments); the sampling rate is required to
# turn the segment times into sample indices.
extract_segments(y, sr, segments)
应该输出类似以下内容:
Action Time Duration Start End
0 A 2020-04-16 21:25:36.241 00:00:00.502000 00:00:00 00:00:00.502000
1 B 2020-04-16 21:25:36.317 00:00:02.253000 00:00:00.076000 00:00:02.329000
2 X 2020-04-16 21:25:36.734 00:00:00.118000 00:00:00.493000 00:00:00.611000
3 C 2020-04-16 21:25:36.837 00:00:00.010000 00:00:00.596000 00:00:00.606000
4 D 2020-04-16 21:25:37.537 00:00:00.797000 00:00:01.296000 00:00:02.093000
5 X 2020-04-16 21:25:37.606 00:00:00.070000 00:00:01.365000 00:00:01.435000
6 A 2020-04-16 21:25:37.874 00:00:01.506000 00:00:01.633000 00:00:03.139000
extracting audio segment: 8032 samples
extracting audio segment: 36048 samples
extracting audio segment: 1888 samples
extracting audio segment: 160 samples
extracting audio segment: 12752 samples
extracting audio segment: 1120 samples
extracting audio segment: 24097 samples
下面是另一个版本的代码,它把每个片段分别写入单独的 wav 文件:
import io
import pandas
import numpy as np
import librosa
import soundfile as sf
def read_data(annot, date_format):
    """Load the annotation CSV and compute each segment's offsets.

    Parameters
    ----------
    annot : str or file-like
        Passed straight to ``pandas.read_csv``. The data must contain the
        columns ``DateTime`` and ``Duration ms``.
    date_format : str
        ``strftime``-style format used to parse ``DateTime``.

    Returns
    -------
    pandas.DataFrame
        The input rows with ``DateTime`` and ``Duration ms`` replaced by
        ``Time`` (datetime) and ``Duration`` (timedelta), plus ``Start`` and
        ``End`` timedeltas measured from the first row's ``Time``.

    Raises
    ------
    IndexError
        If the file contains no data rows (there is no first timestamp).
    """
    df = pandas.read_csv(annot, sep=',')

    # Use proper pandas datatypes
    df['Time'] = pandas.to_datetime(df['DateTime'], format=date_format)
    df['Duration'] = pandas.to_timedelta(df['Duration ms'], unit='ms')
    df = df.drop(columns=['DateTime', 'Duration ms'])

    # Compute start and end time of each segment.
    # The audio starts at the time of the first segment, so every offset is
    # taken relative to the first row.
    first = df['Time'].iloc[0]
    df['Start'] = df['Time'] - first
    df['End'] = df['Start'] + df['Duration']

    return df
def _timedelta_label(delta):
    """Turn a Timedelta such as '0 days 00:00:00.493000' into '00_00_00.493000'."""
    # str(Timedelta) looks like '<n> days HH:MM:SS[.ffffff]'; keep only the
    # clock part and make it safe for use in a file name.
    return str(delta).split('s ')[1].replace(':', '_')


def extract_segments(y, segments, sr=None):
    """Slice the audio into the annotated segments and write each to a wav file.

    Parameters
    ----------
    y : sequence supporting slicing
        The audio samples.
    segments : pandas.DataFrame
        Must provide the columns ``Activity`` (string label) and the
        timedelta columns ``Start`` and ``End``.
    sr : number, optional
        Sampling rate used to convert seconds into sample indices and passed
        to ``sf.write``. When omitted, the module-level ``sr`` is used, which
        is what this function relied on before the parameter existed.

    Raises
    ------
    NameError
        If ``sr`` is not given and no module-level ``sr`` exists.

    Each file is named ``<first 5 chars of Activity>__<start>__<end>.wav`` and
    written to the current working directory; the segment length in samples
    is printed as it is written.
    """
    if sr is None:
        # Backward compatibility: older callers set a global `sr` (from
        # librosa.load) and call extract_segments(y, segments).
        try:
            sr = globals()['sr']
        except KeyError:
            raise NameError(
                "name 'sr' is not defined; pass the sampling rate as sr=..."
            ) from None

    # Compute segment regions in number of samples. The start is rounded down
    # and the end rounded up, so a slice never covers less than the annotated
    # interval.
    starts = np.floor(segments.Start.dt.total_seconds() * sr).astype(int)
    ends = np.ceil(segments.End.dt.total_seconds() * sr).astype(int)

    # Iterate over the columns directly instead of indexing by a running
    # counter, so a frame with a non-default index (e.g. after filtering one
    # activity) still works.
    rows = zip(starts, ends, segments.Activity, segments.Start, segments.End)
    for start, end, activity, seg_start, seg_end in rows:
        audio_seg = y[start:end]
        print('extracting audio segment:', len(audio_seg), 'samples')

        # File name: first 5 characters of the Activity label, then the start
        # and end offsets with ':' replaced by '_'.
        file_name = (
            f"{activity[:5]}"
            f"__{_timedelta_label(seg_start)}"
            f"__{_timedelta_label(seg_end)}.wav"
        )
        sf.write(file_name, audio_seg, sr)
# Load the annotations and show the computed Start/End offsets.
segments = read_data("annot.csv", date_format="%m/%d/%y %H:%M:%S:%f")
print(segments)

# NOTE(review): no sr= is passed, so librosa chooses the sampling rate itself
# (and may resample the file) - confirm this is intended. `duration` limits
# how much of the file is loaded.
y, sr = librosa.load("audio.wav", duration=2027)

# extract_segments uses the module-level `sr` assigned just above.
extract_segments(y, segments)