在 stack overflow 上看到一个这样的求助帖子,贴主想用 webrtcvad and pydub 分离一段音频,好像是从一个人说的话里面,单独把单词提取出来。下面的有个大佬给了他一个解决方案就是用 vosk 来识别和分离,,,
可是我基本不懂 python ,但是 又非常想实现这个功能,因为最近学英语,一段音频里面有几百个单词,我想把他们按单词分段提取出来,然后导入到 anki 进行学习。
目前在用的是 ffmpeg 配合一个网上下载的 python 文件来实现,但是效果只能达到 80%左右,因为有些单词确实很难区分静音部分,比如 sector 这个单词,前面的 s 发音会被剪掉,然后今天发现了 vosk 这个识别库,不知道能不能做到 95%的完美。
原帖地址在下面,那位答主说的很详细了,可是我就是看不懂,希望能有大佬指点一二,
顺便也把我之前辛苦找到的一段也附上,现在我都不知道哪里找到的了,只是觉得这段文字解决了我很大的问题,就是还谈不上完美。。。。
https://stackoverflow.com/questions/64153590/audio-signal-split-at-word-level-boundary

目前我在用的代码:
# -*- coding: utf-8 -*-
from pydub import AudioSegment
from pydub.silence import detect_silence
import os
import uuid
# Generate a dash-free GUID string.
def GUID():
    """Return a new UUID1-based identifier as 32 lowercase hex characters."""
    return uuid.uuid1().hex
# Cut one time slice out of an audio file and save it to save_path.
def SplitSound(filename, save_path, save_file_name, start_time, end_time, audio_type='mp3'):
    """Export the [start_time, end_time] slice (milliseconds) of *filename*.

    filename: path of the source audio file.
    save_path: output directory; created if it does not exist.
    save_file_name: name of the exported file inside save_path.
    start_time / end_time: slice boundaries in milliseconds (pydub slicing).
    audio_type: format string passed to pydub (default 'mp3').
    """
    if not os.path.exists(save_path):
        try:
            # makedirs also creates missing parent directories.
            os.makedirs(save_path)
        except Exception as e:
            # Best-effort: keep going, export below will fail loudly if needed.
            print(e)
    sound = AudioSegment.from_file(filename, format=audio_type)
    result = sound[start_time:end_time]
    # BUG FIX: the original referenced the module-level global `savePath`
    # here, silently ignoring the `save_path` parameter.
    final_name = os.path.join(save_path, save_file_name)
    result.export(final_name, format=audio_type)
# AudioSegment.export(result, format=audioType)
def SplitSilence(file_name, save_path, audio_type='mp3'):
    """Split *file_name* at silent gaps and export numbered chunks to save_path.

    A gap counts as silence when it stays below -57 dBFS for at least 800 ms
    (scanned in 1 ms steps). Each chunk is cut at the midpoint of the silence
    that follows it, so every word keeps a little quiet padding on both sides.
    """
    sound = AudioSegment.from_file(file_name, format=audio_type)
    # List of [start_ms, end_ms] silent ranges.
    silences = detect_silence(sound, 800, -57, 1)
    start_point = 0
    index = 1
    for silence_start, silence_end in silences:
        if silence_start == 0:
            # BUG FIX: leading silence used to be left attached to chunk 1;
            # skip past it instead.
            start_point = silence_end
            continue
        # Cut in the middle of the silent gap.
        end_point = (silence_start + silence_end) / 2
        print("%d-%d" % (start_point, end_point))
        SplitSound(file_name, save_path, str(index) + ".mp3", start_point, end_point)
        index = index + 1
        start_point = silence_end
    # Export the tail after the last detected silence.
    SplitSound(file_name, save_path, str(index) + ".mp3", start_point, len(sound))
# len(sound)
# Input file and output directory; edit these paths for your own audio.
audioPath = "/Users/maptoca/Desktop/mp3 分割 /5.3.11.mp3"
savePath = "/Users/maptoca/Desktop/mp3 分割 /save5.3.11"

# Guard so importing this module does not immediately start splitting.
if __name__ == '__main__':
    SplitSilence(audioPath, savePath)
下面是那位答主的代码,我不知道如何在它的基础上实现我想要的功能:把每个单词单独分离成一个音频文件。
import sys
import os
import subprocess
import json
import math
# tested with VOSK 0.3.15
import vosk
import librosa
import numpy
import pandas
def extract_words(res):
    """Parse a VOSK result JSON string and return its list of word entries.

    res: JSON string as returned by KaldiRecognizer.Result()/FinalResult().
    Returns the 'result' list (dicts with word timing info), or [] when the
    recognizer produced no words for this chunk.
    """
    jres = json.loads(res)
    # Interim / empty recognizer results carry no 'result' key.
    return jres.get('result', [])
def transcribe_words(recognizer, audio_bytes):
    """Stream PCM audio into *recognizer* in chunks and collect word timings.

    recognizer: a vosk.KaldiRecognizer.
    audio_bytes: 16-bit signed PCM data (bytes). NOTE: parameter renamed from
        `bytes`, which shadowed the builtin; the only call site is positional.
    Returns a flat list of word dicts accumulated from every finalized
    utterance plus the recognizer's final flush.
    """
    results = []
    chunk_size = 4000
    for start in range(0, len(audio_bytes), chunk_size):
        data = audio_bytes[start:start + chunk_size]
        # AcceptWaveform returns True when a complete utterance is ready.
        if recognizer.AcceptWaveform(data):
            results += extract_words(recognizer.Result())
    # Flush whatever is still buffered in the recognizer.
    results += extract_words(recognizer.FinalResult())
    return results
def main():
    """CLI entry point: transcribe argv[1] with VOSK, write word timings to argv[2] as CSV."""
    vosk.SetLogLevel(-1)

    audio_path = sys.argv[1]
    out_path = sys.argv[2]

    # Model must be downloaded from https://alphacephei.com/vosk/models
    # e.g. https://alphacephei.com/vosk/models/vosk-model-small-de-0.15.zip
    # (original comment block was line-mangled and had swallowed this check)
    model_path = 'vosk-model-small-de-0.15'
    if not os.path.exists(model_path):
        raise ValueError(f"Could not find VOSK model at {model_path}")

    sample_rate = 16000
    # Resample the audio to the rate the model expects.
    audio, sr = librosa.load(audio_path, sr=sample_rate)
    # Convert float [-1, 1] samples to 16-bit signed PCM, as expected by VOSK.
    int16 = numpy.int16(audio * 32768).tobytes()

    model = vosk.Model(model_path)
    recognizer = vosk.KaldiRecognizer(model, sample_rate)

    res = transcribe_words(recognizer, int16)
    df = pandas.DataFrame.from_records(res)
    df = df.sort_values('start')

    df.to_csv(out_path, index=False)
    print('Word segments saved to', out_path)


if __name__ == '__main__':
    main()