问题描述
我正在研究语音识别系统,但在提取信号特征时遇到问题
audio_path = r"C:\Users\Salma\Downloads\TORGO"
files = os.listdir(audio_path)
all_wave = []
all_label = []
mfcc_features = []
ignored = {"Notes","phn_arrayMic","amps","rawpos","wavall","pos","alignment","Cpcmd","EMA","VIDEO","wav_headMic","normpos","log"}
for file in files:
for subfile in os.listdir(audio_path + '/'):
for session in [s for s in os.listdir(audio_path + '/' + subfile +'/')if s not in ignored]:
for subsession in [x for x in os.listdir(audio_path + '/'+ subfile +'/'+ session + '/')if x not in ignored]:
for data in [y for y in os.listdir(audio_path + '/'+ subfile +'/'+ session + '/' +subsession +'/') if y not in ignored]:
waves = [f for f in os.listdir(audio_path + '/' + subfile +'/'+ session +'/' + subsession +'/' + data + '/')if f.endswith('.wav')]
labels = [L for L in os.listdir(audio_path + '/' + subfile +'/'+ session +'/' + subsession + '/' + data + '/')if L.endswith('.txt')]
for wav in waves:
samples,sample_rate = librosa.load(audio_path + '/' + subfile +'/'+ session + '/'+subsession +'/'+ data +'/'+ wav,sr = 16000)
samples = librosa.resample(samples,sample_rate,16000)
mfcc = librosa.feature.mfcc(samples,n_mfcc=13)
mfcc = mfcc.T
all_wave.append(samples)
mfcc_features.append(mfcc.tolist())
for label in labels:
file_path = audio_path + '/' + subfile + '/' + session + '/' + subsession + '/' + data +'/'+ label
l = open(file_path,"r")
for target in l:
all_label.append(target.strip())
如果我删除了 mfcc = librosa.feature.mfcc,代码工作正常
这里是返回错误:
ValueError Traceback (most recent call last)
<ipython-input-13-fd45bf295b2c> in <module>
12 samples,sr = 16000)
13 samples = librosa.resample(samples,16000)
---> 14 mfcc = librosa.feature.mfcc(samples,n_mfcc=13)
15 mfcc = mfcc.T
16 all_wave.append(samples)
C:\Users\Salma\Anaconda3\lib\site-packages\librosa\feature\spectral.py in mfcc(y,sr,S,n_mfcc,dct_type,norm,lifter,**kwargs)
1850
1851 if S is None:
-> 1852 S = power_to_db(melspectrogram(y=y,sr=sr,**kwargs))
1853
1854 M = scipy.fftpack.dct(S,axis=0,type=dct_type,norm=norm)[:n_mfcc]
C:\Users\Salma\Anaconda3\lib\site-packages\librosa\feature\spectral.py in melspectrogram(y,n_fft,hop_length,win_length,window,center,pad_mode,power,**kwargs)
2003 window=window,2004 center=center,-> 2005 pad_mode=pad_mode,2006 )
2007
C:\Users\Salma\Anaconda3\lib\site-packages\librosa\core\spectrum.py in _spectrogram(y,pad_mode)
2517 center=center,2518 window=window,-> 2519 pad_mode=pad_mode,2520 )
2521 )
C:\Users\Salma\Anaconda3\lib\site-packages\librosa\core\spectrum.py in stft(y,dtype,pad_mode)
226 )
227
--> 228 y = np.pad(y,int(n_fft // 2),mode=pad_mode)
229
230 elif n_fft > y.shape[-1]:
<__array_function__ internals> in pad(*args,**kwargs)
C:\Users\Salma\Anaconda3\lib\site-packages\numpy\lib\arraypad.py in pad(array,pad_width,mode,**kwargs)
817 raise ValueError(
818 "can't extend empty axis {} using modes other than "
--> 819 "'constant' or 'empty'".format(axis)
820 )
821 # passed,don't need to do anything more as _pad_simple already
ValueError: can't extend empty axis 0 using modes other than 'constant' or 'empty'
所以这里可能是哪里出了问题,提前致谢
解决方法
我遇到了同样的问题,.m4a 音频持续时间为 280 秒,但只有 160KB。 我觉得这个音频很可能坏了,你最好检查一下音频。
,我认为您可以轻松实现 MFCC 并且 here 具有完整细节的完美实现