CivilのKeTCindy教室: 音声(mp3)のセンテンスのテキスト出力及び各センテンスのmp3出力(python,windows10)

[F:\英語の学習-実習\mp3-2-text\t1-mp3-to-sentense-mp3.py](作業場所)

■音声(mp3)のセンテンスのテキスト出力及び各センテンスのmp3出力の試み
上手くいかない場合があります。
改善が必要ですが覚書として記録したものです。

import speech_recognition as sr

from pydub import AudioSegment

from pydub.silence import split_on_silence

import os

# Input MP3 file and the intermediate WAV file used for recognition.
mp3_file = "practice.mp3"
wav_file = "converted.wav"

# Split the input path into its parts.
basename, ext = os.path.splitext(mp3_file)  # e.g. "practice", ".mp3"
dir_name, file_name = os.path.split(mp3_file)  # e.g. "", "practice.mp3"

# File name with its extension removed, e.g. "practice".
tmp2 = os.path.splitext(file_name)[0]

# The transcript is saved as "<name>/<name>.txt", e.g. "practice/practice.txt".
txt_file = tmp2 + "/" + tmp2 + ".txt"
directory = os.path.dirname(txt_file)  # e.g. "practice"

# Create the output directory if needed. exist_ok=True replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs(directory, exist_ok=True)

# Length of silence between chunks, in seconds (1 for English-only audio;
# 2 was also tried).
silence_duration = 1

# Convert the MP3 file to WAV.
sound = AudioSegment.from_mp3(mp3_file)
sound.export(wav_file, format="wav")

# Audio type: 'e-only' for English only, 'j-only' for Japanese translation only.
audio_type = 'e-only'

# Build a silent audio segment.
def generate_silence(duration_sec):
    """Return a silent ``AudioSegment`` lasting ``duration_sec`` seconds."""
    duration_ms = duration_sec * 1000  # AudioSegment.silent takes milliseconds
    return AudioSegment.silent(duration=duration_ms)

# Split an audio file into chunks at its silent sections.
def split_audio(file_path, min_silence_len=1200, silence_margin_db=14,
                keep_silence=500):
    """Split the WAV file at ``file_path`` into chunks separated by silence.

    Args:
        file_path: Path of the WAV file to load.
        min_silence_len: Minimum length (ms) of a quiet stretch for it to
            count as silence. Tune this to the recording; 500 was tried
            before settling on 1200.
        silence_margin_db: How far below the recording's average loudness
            (``sound.dBFS``) audio must be to count as silence.
        keep_silence: Amount of silence (ms) kept at the start and end of
            each chunk.

    Returns:
        The chunks produced by ``pydub.silence.split_on_silence``.
    """
    sound = AudioSegment.from_wav(file_path)
    return split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        # The threshold is relative to this recording's own loudness.
        silence_thresh=sound.dBFS - silence_margin_db,
        keep_silence=keep_silence,
    )

# Capitalize English sentences and make sure every sentence ends with
# terminal punctuation.
def format_sentence(sentence, is_english=False):
    """Normalize one recognized sentence.

    Surrounding whitespace is stripped. English sentences get their first
    character upper-cased and a trailing "." when they do not already end
    with a sentence terminator; Japanese sentences get a trailing "。" in
    the same situation. An empty (or whitespace-only) sentence is returned
    as an empty string.

    Args:
        sentence: The recognized text.
        is_english: True to apply English formatting, False for Japanese.

    Returns:
        The formatted sentence.
    """
    sentence = sentence.strip()
    if not sentence:
        return sentence

    if is_english:
        # Upper-case only the first character. str.capitalize() would also
        # lowercase the rest, turning "Tokyo" into "tokyo" and "I" into "i".
        sentence = sentence[0].upper() + sentence[1:]
        terminators = ('.', '?', '!', '。', '？', '！')
        default_terminator = '.'
    else:
        terminators = ('。', '？', '！')
        default_terminator = '。'

    if not sentence.endswith(terminators):
        sentence += default_terminator
    return sentence

# Resolve the recognition language up front. An unsupported audio_type
# previously surfaced only as a NameError on the first recognized chunk.
if audio_type == 'e-only':
    recognition_language, is_english_audio = "en-US", True
elif audio_type == 'j-only':
    recognition_language, is_english_audio = "ja-JP", False
else:
    raise ValueError(
        f"Unsupported audio_type {audio_type!r}; expected 'e-only' or 'j-only'"
    )

# Prepare the speech recognizer.
recognizer = sr.Recognizer()

# Split the converted WAV file into chunks at silent sections.
audio_chunks = split_audio(wav_file)

# Recognized text, one entry per successfully recognized chunk.
recognized_sentences = []

# Temporary chunk WAV files written below (removed at the end of the script).
chunk_files = []

# Convert each chunk to text.
for i, chunk in enumerate(audio_chunks):
    chunk_filename = f"chunk{i}.wav"

    # The recognizer reads from a file, so save each chunk to disk first.
    chunk.export(chunk_filename, format="wav")
    chunk_files.append(chunk_filename)

    with sr.AudioFile(chunk_filename) as source:
        audio = recognizer.record(source)

    # Only the recognition request itself is expected to fail here.
    try:
        text = recognizer.recognize_google(audio, language=recognition_language)
    except sr.UnknownValueError:
        print(f"チャンク{i} は認識できませんでした。")
    except sr.RequestError as e:
        print(f"Googleサービスへのリクエストに失敗しました; {e}")
    else:
        formatted_sentence = format_sentence(text, is_english=is_english_audio)
        recognized_sentences.append(formatted_sentence)

# Write the recognized sentences to the transcript file, one per line.
with open(txt_file, mode='w', encoding='utf-8') as txtfile:
    txtfile.writelines(f"{sentence}\n" for sentence in recognized_sentences)

print(f"出力が完了しました。結果は {txt_file} に保存されました。")

# Save every chunk as its own MP3 file.
# Each file is named after the input MP3 plus "-" and a 3-digit sequence
# number, e.g. practice-001.mp3.
output_dir = directory

# Create the output directory if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# basename is the input path minus its extension, so it still carries any
# directory prefix. Keep only the file-name part so the output always lands
# directly inside output_dir.
mp3_stem = os.path.basename(basename)

for i, chunk in enumerate(audio_chunks):
    mp3_filename = f"{mp3_stem}-{i+1:03d}.mp3"
    output_path = os.path.join(output_dir, mp3_filename)

    # Export the chunk as MP3.
    chunk.export(output_path, format="mp3")
    print(f"{output_path} に保存されました。")

# Remove the temporary files: the converted WAV first, then each chunk WAV.
for temp_file in [wav_file, *chunk_files]:
    if not os.path.exists(temp_file):
        continue
    os.remove(temp_file)
    print(f"{temp_file} を削除しました。")

メニュー

音声(mp3)のセンテンスのテキスト出力及び各センテンスのmp3出力(python,windows10)