🖥️🤖 Транскрибация видео и создание субтитров с помощью Whisper, FFmpeg и Python
Следуя нашему пошаговому руководству, вы сможете автоматически транскрибировать аудио и добавлять субтитры к своим видео всего за несколько минут.
Необходимые инструменты: Python, Whisper, FFmpeg.
Настройка рабочего пространства
Создадим рабочую папку
mkdir open-ai-whisper-ffmpeg
Перейдем в папку проекта и создадим виртуальное окружение:
cd open-ai-whisper-ffmpeg
python3 -m venv .venv
source .venv/bin/activate
Установим необходимые пакеты для OpenAI Whisper:
pip install git+https://github.com/m-bain/whisperx.git
Транскрибация видео
Сначала создадим новый файл Python — main.py.
touch main.py
И добавим в него код:
from datetime import timedelta
import os

import whisperx


def transcribe_video(input_video):
    """Transcribe the audio of a video and save the result as ``subtitles.srt``.

    Loads the Whisper ``large-v2`` model through whisperx on the CPU,
    transcribes the audio as English, aligns the segments and writes one
    SRT cue per aligned segment.

    Args:
        input_video: Path of the media file handed to ``whisperx.load_audio``.

    Returns:
        The path of the subtitle file that was written.
    """
    batch_size = 32
    compute_type = "float32"
    device = "cpu"

    model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(input_video)
    result = model.transcribe(audio, batch_size=batch_size, language="en")

    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(
        result["segments"], model_a, metadata, audio, device,
        return_char_alignments=False,
    )
    segments = result["segments"]

    def srt_timestamp(seconds):
        """Format a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
        # Keep the milliseconds: truncating to whole seconds shifts every cue
        # and can give short segments identical start and end times.
        total_ms = max(0, round(timedelta(seconds=float(seconds)) / timedelta(milliseconds=1)))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1_000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    srtFilename = "subtitles.srt"
    cue_number = 0
    # Mode "w" truncates a subtitles.srt left over from a previous run, and the
    # file is created even when there are no segments.
    with open(srtFilename, "w", encoding="utf-8") as srtFile:
        for segment in segments:
            text = segment["text"].strip()
            if not text:
                continue  # a cue without text is not valid SRT
            cue_number += 1
            print(text)
            startTime = srt_timestamp(segment["start"])
            endTime = srt_timestamp(segment["end"])
            srtFile.write(f"{cue_number}\n{startTime} --> {endTime}\n{text}\n\n")

    return srtFilename


def main():
    """Transcribe ``input.mp4`` from the current directory."""
    input_video_path = "input.mp4"
    transcribe_video(input_video_path)


if __name__ == "__main__":
    main()
Давайте рассмотрим, что мы делаем в приведенном выше коде. В этих строках мы импортируем необходимые пакеты для работы: whisperx для загрузки модели Whisper, os для получения пути к файлу субтитров и timedelta для форматирования текста временных меток.
from datetime import timedelta
import os

import whisperx
Здесь мы определили функцию, которая принимает входное видео, загружает модель Whisper large-v2, указывает тип вычислений и настраивает модель на использование CPU вместо GPU.
После этого функция загружает аудио в модель, затем транскрибирует и возвращает текст с временными метками.
def transcribe_video(input_video):
    """Transcribe the audio of ``input_video`` with Whisper large-v2 on the CPU.

    The audio is transcribed as English and the resulting segments are then
    aligned with whisperx.
    """
    batch_size = 32
    compute_type = "float32"
    device = "cpu"

    model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(input_video)
    result = model.transcribe(audio, batch_size=batch_size, language="en")

    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(
        result["segments"], model_a, metadata, audio, device,
        return_char_alignments=False,
    )
    segments = result["segments"]
После этого функция проходит по всем полученным моделью результатам, преобразует их в формат .srt и добавляет каждый элемент слова в файл subtitles.srt.
# if srt file exists, delete it if os.path.exists("subtitles.srt"): os.remove("subtitles.srt") for index, segment in enumerate(segments): startTime = str(0)+str(timedelta(seconds=int(segment['start'])))+',000' endTime = str(0)+str(timedelta(seconds=int(segment['end'])))+',000' text = segment['text'] print(text) segment = f"{index + 1}\n{startTime} --> {endTime}\n{text[1:] if text[0] == ' ' else text}\n\n" srtFilename = os.path.join(f"subtitles.srt") with open(srtFilename, 'a', encoding='utf-8') as srtFile: srtFile.write(segment) return srtFilename
Добавление субтитров к видео
Далее мы загружаем subtitles.srt в видео с помощью FFmpeg. В итоге получаем следующий скрипт:
from datetime import timedelta
import os
import subprocess

import whisperx


def transcribe_video(input_video):
    """Transcribe the audio of a video and save the result as ``subtitles.srt``.

    Loads the Whisper ``large-v2`` model through whisperx on the CPU,
    transcribes the audio as English, aligns the segments and writes one
    SRT cue per aligned segment.

    Args:
        input_video: Path of the media file handed to ``whisperx.load_audio``.

    Returns:
        The path of the subtitle file that was written.
    """
    batch_size = 32
    compute_type = "float32"
    device = "cpu"

    model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(input_video)
    result = model.transcribe(audio, batch_size=batch_size, language="en")

    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(
        result["segments"], model_a, metadata, audio, device,
        return_char_alignments=False,
    )
    segments = result["segments"]

    def srt_timestamp(seconds):
        """Format a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
        # Keep the milliseconds: truncating to whole seconds shifts every cue
        # and can give short segments identical start and end times.
        total_ms = max(0, round(timedelta(seconds=float(seconds)) / timedelta(milliseconds=1)))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1_000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    srtFilename = "subtitles.srt"
    cue_number = 0
    # Mode "w" truncates a subtitles.srt left over from a previous run, and the
    # file is created even when there are no segments.
    with open(srtFilename, "w", encoding="utf-8") as srtFile:
        for segment in segments:
            text = segment["text"].strip()
            if not text:
                continue  # a cue without text is not valid SRT
            cue_number += 1
            print(text)
            startTime = srt_timestamp(segment["start"])
            endTime = srt_timestamp(segment["end"])
            srtFile.write(f"{cue_number}\n{startTime} --> {endTime}\n{text}\n\n")

    return srtFilename


def add_srt_to_video(input_video, output_file):
    """Render ``subtitles.srt`` onto the video with FFmpeg's ``subtitles`` filter.

    The audio stream is copied unchanged (``-c:a copy``) and the ``-y`` flag is
    passed so FFmpeg does not prompt before writing ``output_file``.

    Args:
        input_video: Path of the source video.
        output_file: Path of the video file to create.

    Raises:
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
    """
    subtitles_file = "subtitles.srt"
    force_style = (
        "FontName=Arial,FontSize=10,PrimaryColour=&HFFFFFF,"
        "OutlineColour=&H000000,BorderStyle=3,Outline=1,Shadow=1,"
        "Alignment=2,MarginV=10"
    )

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and cannot be used to inject extra commands.
    ffmpeg_command = [
        "ffmpeg",
        "-i", input_video,
        "-vf", f"subtitles={subtitles_file}:force_style='{force_style}'",
        "-c:a", "copy",
        output_file,
        "-y",
    ]

    # check=True surfaces a failed FFmpeg run instead of silently continuing.
    subprocess.run(ffmpeg_command, check=True)


def main():
    """Transcribe ``input.mp4`` and write the subtitled copy to ``output.mp4``."""
    input_video_path = "input.mp4"
    output_file = "output.mp4"
    transcribe_video(input_video_path)
    add_srt_to_video(input_video_path, output_file)


if __name__ == "__main__":
    main()