Conversation Audio Generation
Inspired by a YouTube video discussing DeepSeek-V3, I have been experimenting with AI-generated conversations. The goal is to use Google Text-to-Speech for synthesis and ffmpeg for concatenation, producing a realistic audio dialogue. The prompt and code below outline my current approach to simulating a natural back-and-forth conversation.
Prompt
Create a natural, extended conversation of at least 100 turns between two experts, A and B. The experts should discuss a specific topic in depth, and the conversation should flow back and forth. Both participants should ask questions, share insights, and explore the nuances of the subject. Use the following format:
[
  {
    "speaker": "A",
    "line": "Hey, I keep hearing about machine learning (ML), deep learning (DL), and GPT lately. Can you break them down for me?"
  },
  {
    "speaker": "B",
    "line": "Sure! Let's start with the basics. Machine learning is a field of computer science where systems learn from data to improve their performance without being explicitly programmed. Think of it as teaching a computer to recognize patterns."
  }
]
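Before a model response is saved into scripts/conversation for the pipeline below, it is worth checking that it really is a JSON array of {"speaker", "line"} objects. This is a minimal validation sketch, not part of the original workflow; the helper name validate_conversation is my own.

import json

def validate_conversation(raw_text):
    # Hypothetical helper: parse the model's reply and verify the structure
    # the prompt asks for before saving it under scripts/conversation.
    conversation = json.loads(raw_text)
    if not isinstance(conversation, list) or len(conversation) < 2:
        raise ValueError("expected a JSON array with multiple turns")
    for idx, turn in enumerate(conversation):
        if not isinstance(turn, dict) or turn.get("speaker") not in ("A", "B"):
            raise ValueError(f"turn {idx}: speaker must be 'A' or 'B'")
        if not str(turn.get("line", "")).strip():
            raise ValueError(f"turn {idx}: missing or empty line")
    return conversation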
Code
import os
import json
import random
import subprocess
from google.cloud import texttospeech
import time
import argparse

# Fixed directories for conversation scripts and generated audio
OUTPUT_DIRECTORY = "assets/conversations"
INPUT_DIRECTORY = "scripts/conversation"


def text_to_speech(text, output_filename, voice_name=None):
    print(f"Generating audio: {output_filename}")
    try:
        client = texttospeech.TextToSpeechClient()
        synthesis_input = texttospeech.SynthesisInput(text=text)
        voice = texttospeech.VoiceSelectionParams(language_code="en-US", name=voice_name)
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            effects_profile_id=["small-bluetooth-speaker-class-device"]
        )
        # Retry with exponential backoff to ride out transient API errors
        retries = 5
        for attempt in range(1, retries + 1):
            try:
                response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)
                with open(output_filename, 'wb') as out:
                    out.write(response.audio_content)
                print(f"Audio content written to {output_filename}")
                return True
            except Exception as e:
                print(f"Error on attempt {attempt}: {e}")
                if attempt == retries:
                    print(f"Audio generation failed after {retries} attempts.")
                    return False
                wait_time = 2 ** attempt
                print(f"Waiting {wait_time} seconds before retrying...")
                time.sleep(wait_time)
    except Exception as e:
        print(f"Error while generating audio for {output_filename}: {e}")
        return False


def process_conversation(filename):
    filepath = os.path.join(INPUT_DIRECTORY, filename)
    output_filename = os.path.join(OUTPUT_DIRECTORY, os.path.splitext(filename)[0] + ".mp3")
    if os.path.exists(output_filename):
        print(f"Audio file already exists: {output_filename}")
        return
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            conversation = json.load(f)
    except Exception as e:
        print(f"Error loading conversation file {filename}: {e}")
        return

    # Pick two distinct voices so speakers A and B sound different
    temp_files = []
    voice_options = ["en-US-Journey-D", "en-US-Journey-F", "en-US-Journey-O"]
    voice_name_A = random.choice(voice_options)
    voice_name_B = random.choice(voice_options)
    while voice_name_A == voice_name_B:
        voice_name_B = random.choice(voice_options)

    # Synthesize each line of dialogue into its own temporary MP3
    for idx, line_data in enumerate(conversation):
        speaker = line_data.get("speaker")
        line = line_data.get("line")
        if not line:
            continue
        temp_file = os.path.join(OUTPUT_DIRECTORY, f"temp_{idx}.mp3")
        temp_files.append(temp_file)
        voice_name = None
        if speaker == "A":
            voice_name = voice_name_A
        elif speaker == "B":
            voice_name = voice_name_B
        if not text_to_speech(line, temp_file, voice_name=voice_name):
            print(f"Failed to generate audio for line {idx+1} of {filename}")
            # Clean up temporary files
            for temp_file_to_remove in temp_files:
                if os.path.exists(temp_file_to_remove):
                    os.remove(temp_file_to_remove)
            return

    if not temp_files:
        print(f"No audio generated for {filename}")
        return

    # Concatenate the per-line clips with ffmpeg's concat demuxer
    concat_file = os.path.join(OUTPUT_DIRECTORY, "concat.txt")
    with open(concat_file, 'w') as f:
        for temp_file in temp_files:
            f.write(f"file '{os.path.abspath(temp_file)}'\n")
    try:
        subprocess.run(
            ['ffmpeg', '-f', 'concat', '-safe', '0', '-i', concat_file, '-c', 'copy', output_filename],
            check=True,
            capture_output=True
        )
        print(f"Successfully concatenated audio into {output_filename}")
    except subprocess.CalledProcessError as e:
        print(f"Error concatenating audio: {e.stderr.decode()}")
    finally:
        os.remove(concat_file)
        for temp_file in temp_files:
            os.remove(temp_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process conversation JSON files to generate audio.")
    args = parser.parse_args()
    os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)
    for filename in os.listdir(INPUT_DIRECTORY):
        if filename.endswith(".json"):
            process_conversation(filename)
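Concatenating with -c copy avoids re-encoding; it works here because every clip is produced with the same AudioConfig, so the MP3 parameters should match across segments (if they ever differed, the files would need to be re-encoded instead of stream-copied). As an optional sanity check after a run, the duration of each concatenated file can be read with ffprobe. This is just a sketch, not part of the script above.

import os
import subprocess

def mp3_duration_seconds(path):
    # ffprobe ships with ffmpeg; this reads the container duration in seconds.
    result = subprocess.run(
        ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
         '-of', 'default=noprint_wrappers=1:nokey=1', path],
        check=True, capture_output=True, text=True
    )
    return float(result.stdout.strip())

for name in sorted(os.listdir("assets/conversations")):
    if name.endswith(".mp3"):
        path = os.path.join("assets/conversations", name)
        print(f"{name}: {mp3_duration_seconds(path):.1f} s")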
Cover
ffmpeg -i deepseek.jpg -vf "crop=854:480" deepseek_480p_cropped.jpg
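The crop filter takes a centered 854x480 window from the source image, so it assumes the artwork is at least that large. For sources that might be smaller or have a different aspect ratio, a scale-then-crop chain is safer; the sketch below wraps that variant in subprocess to match the rest of the pipeline, reusing the same file names as the command above.

import subprocess

# Scale so the image fully covers 854x480, then crop the centered region.
subprocess.run(
    ['ffmpeg', '-y', '-i', 'deepseek.jpg',
     '-vf', 'scale=854:480:force_original_aspect_ratio=increase,crop=854:480',
     'deepseek_480p_cropped.jpg'],
    check=True
)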
Video
ffmpeg -loop 1 -i deepseek.jpg -i deepseek.mp3 -c:v libx264 -tune stillimage -c:a aac -b:a 192k -shortest output_video.mp4
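To turn every generated conversation into a video in one pass, the same still-image command can be wrapped in a loop. A minimal sketch, assuming one shared cover image for all conversations; the per-conversation output naming is illustrative.

import os
import subprocess

AUDIO_DIR = "assets/conversations"
COVER = "deepseek_480p_cropped.jpg"  # cover produced by the crop step above

for name in sorted(os.listdir(AUDIO_DIR)):
    if not name.endswith(".mp3"):
        continue
    audio = os.path.join(AUDIO_DIR, name)
    video = os.path.splitext(audio)[0] + ".mp4"  # illustrative naming
    if os.path.exists(video):
        continue
    subprocess.run(
        ['ffmpeg', '-loop', '1', '-i', COVER, '-i', audio,
         '-c:v', 'libx264', '-tune', 'stillimage',
         '-c:a', 'aac', '-b:a', '192k', '-shortest', video],
        check=True
    )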