TTS：如何将文本转换为 SSML？

问题描述

我的目标是让设备用人声说出文本。所以我使用的是 Google 的 Text-to-Speech API。

这是我的代码的样子：

package ch.yourclick.kitt;

import android.media.MediaPlayer;
import android.os.Build;
import android.os.Bundle;
import android.os.StrictMode;
import android.view.View;

import androidx.annotation.RequiresApi;
import androidx.appcompat.app.AppCompatActivity;
import androidx.viewpager.widget.ViewPager;

import com.google.android.material.floatingactionbutton.FloatingActionButton;
import com.google.android.material.snackbar.Snackbar;
import com.google.android.material.tabs.TabLayout;
import com.google.api.gax.core.FixedCredentialsProvider;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.texttospeech.v1.AudioConfig;
import com.google.cloud.texttospeech.v1.AudioEncoding;
import com.google.cloud.texttospeech.v1.SsmlVoiceGender;
import com.google.cloud.texttospeech.v1.SynthesisInput;
import com.google.cloud.texttospeech.v1.SynthesizeSpeechResponse;
import com.google.cloud.texttospeech.v1.TextToSpeechClient;
import com.google.cloud.texttospeech.v1.TextToSpeechSettings;
import com.google.cloud.texttospeech.v1.VoiceSelectionParams;
import com.google.common.html.HtmlEscapers;
import com.google.protobuf.ByteString;

import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import ch.yourclick.kitt.ui.main.SectionsPagerAdapter;

/**
 * Main screen of the app. Sets up the tabbed pager UI and, in {@link #onClick(View)}, requests
 * speech from the Google Cloud Text-to-Speech API and plays the returned audio.
 */
public class MainActivity extends AppCompatActivity implements View.OnClickListener {

    /** Inflates the layout and wires up the view pager, the tabs and the floating action button. */
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);
        SectionsPagerAdapter sectionsPagerAdapter = new SectionsPagerAdapter(this, getSupportFragmentManager());
        ViewPager viewPager = findViewById(R.id.view_pager);
        viewPager.setAdapter(sectionsPagerAdapter);
        TabLayout tabs = findViewById(R.id.tabs);
        tabs.setupWithViewPager(viewPager);
        FloatingActionButton fab = findViewById(R.id.fab);

        // The floating action button only shows a placeholder snackbar.
        fab.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                Snackbar.make(view, "Replace with your own action", Snackbar.LENGTH_LONG)
                        .setAction("Action", null).show();
            }
        });
    }


    /**
     * Click handler that runs the text-to-speech demo in {@link #hello()}.
     *
     * @param view the view that was clicked (unused)
     */
    @RequiresApi(api = Build.VERSION_CODES.LOLLIPOP)
    @Override
    public void onClick(View view) {
        int sdkInt = android.os.Build.VERSION.SDK_INT;
        if (sdkInt > 8) {
            // Permit everything on the current thread; presumably this is what lets the blocking
            // synthesis request in hello() run on the calling thread.
            StrictMode.ThreadPolicy policy = new StrictMode.ThreadPolicy.Builder()
                    .permitAll().build();
            StrictMode.setThreadPolicy(policy);

            try {
                this.hello();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Demonstrates using the Text-to-Speech API: synthesizes a fixed phrase, stores the MP3 in the
     * app's files directory and plays it.
     *
     * @throws Exception if the credentials cannot be read, the request fails, or the audio cannot
     *     be written or played
     */
    @RequiresApi(api = Build.VERSION_CODES.KITKAT)
    public void hello() throws Exception {
        // R.raw.credential is credential.json; close the stream once the credentials are parsed.
        GoogleCredentials credentials;
        try (InputStream stream = getResources().openRawResource(R.raw.credential)) {
            credentials = GoogleCredentials.fromStream(stream);
        }
        TextToSpeechSettings textToSpeechSettings =
                TextToSpeechSettings.newBuilder()
                        .setCredentialsProvider(FixedCredentialsProvider.create(credentials))
                        .build();

        // Instantiates a client
        try (TextToSpeechClient textToSpeechClient = TextToSpeechClient.create(textToSpeechSettings)) {

            // Set the text input to be synthesized.
            // NOTE(review): the string contains SSML tags but is handed over with setText(),
            // i.e. as plain text; this is the behavior the question is asking about.
            SynthesisInput input = SynthesisInput.newBuilder().setText("<speak>Step 1,take a deep breath. <break time=\"2000ms\"/> Hello?</speak>").build();

            // Build the voice request, select the language code ("en-US") and the ssml voice gender
            // ("neutral")
            VoiceSelectionParams voice =
                    VoiceSelectionParams.newBuilder()
                            .setLanguageCode("en-US")
                            .setSsmlGender(SsmlVoiceGender.NEUTRAL)
                            .build();

            // Select the type of audio file you want returned
            AudioConfig audioConfig =
                    AudioConfig.newBuilder().setAudioEncoding(AudioEncoding.MP3).build();

            // Perform the text-to-speech request on the text input with the selected voice parameters and
            // audio file type
            SynthesizeSpeechResponse response = textToSpeechClient.synthesizeSpeech(input, voice, audioConfig);

            // Get the audio contents from the response
            ByteString audioContents = response.getAudioContent();

            // The same path is used for writing and for playback, so build it once.
            String myFile = getFilesDir() + "/output.mp3";

            // Write the response to the output file.
            try (FileOutputStream out = new FileOutputStream(myFile)) {
                System.out.println(getFilesDir());
                out.write(audioContents.toByteArray());
                System.out.println("Audio content written to file \"output.mp3\"");
            }

            // Play the file that was just written.
            MediaPlayer mediaPlayer = new MediaPlayer();
            mediaPlayer.setDataSource(myFile);
            mediaPlayer.prepare();
            mediaPlayer.start();
        }
    }

}

正如您在代码中看到的，文本应该是“第 1 步，深呼吸。第 2 步......你好？你在吗？”

嗯，我确实得到了音频，但听起来并不自然：它一开头就把标签本身念了出来（“less than speak……”），这并不是我想要的。

它可能不起作用，因为我需要将该纯文本转换为 SSML。但是，我该怎么做？

我使用的是 Android Studio。

更新

以下方法应该可以正常工作：

/**
 * Converts the contents of a plain-text file into an SSML document.
 *
 * <p>The text is HTML-escaped so that characters such as {@code <} and {@code &} are not
 * mistaken for SSML markup, a 2 second {@code <break>} is inserted after every newline, and the
 * result is wrapped in {@code <speak>} tags.
 *
 * @param inputFile path of the text file to read
 * @return the SSML string built from the file's contents
 * @throws Exception if the file cannot be read
 */
public static String textToSsml(String inputFile) throws Exception {

  // Read the input file; decode explicitly as UTF-8 instead of relying on the platform default
  String rawLines =
      new String(Files.readAllBytes(Paths.get(inputFile)), java.nio.charset.StandardCharsets.UTF_8);

  // Replace special characters with HTML Ampersand Character Codes
  // These codes prevent the API from confusing text with SSML tags
  // For example, '<' --> '&lt;' and '&' --> '&amp;'
  String escapedLines = HtmlEscapers.htmlEscaper().escape(rawLines);

  // Convert plaintext to SSML
  // Tag SSML so that there is a 2 second pause after each line
  // (literal replace: no regular expression is needed to match a newline)
  String expandedNewline = escapedLines.replace("\n", "\n<break time='2s'/>");

  // Return the concatenated String of SSML
  return "<speak>" + expandedNewline + "</speak>";
}

参考：https://cloud.google.com/text-to-speech/docs/ssml-tutorial?hl=en#personalizing_synthetic_audio

我仍然不知道如何使用这种方法。但这就是我尝试过的：

package ch.yourclick.kitt;

import android.media.MediaPlayer;
import android.os.Build;
import android.os.Bundle;
import android.os.StrictMode;
import android.view.View;

import androidx.annotation.RequiresApi;
import androidx.appcompat.app.AppCompatActivity;
import androidx.viewpager.widget.ViewPager;

import com.google.android.material.floatingactionbutton.FloatingActionButton;
import com.google.android.material.snackbar.Snackbar;
import com.google.android.material.tabs.TabLayout;
import com.google.api.gax.core.FixedCredentialsProvider;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.texttospeech.v1.AudioConfig;
import com.google.cloud.texttospeech.v1.AudioEncoding;
import com.google.cloud.texttospeech.v1.SsmlVoiceGender;
import com.google.cloud.texttospeech.v1.SynthesisInput;
import com.google.cloud.texttospeech.v1.SynthesizeSpeechResponse;
import com.google.cloud.texttospeech.v1.TextToSpeechClient;
import com.google.cloud.texttospeech.v1.TextToSpeechSettings;
import com.google.cloud.texttospeech.v1.VoiceSelectionParams;
import com.google.common.html.HtmlEscapers;
import com.google.protobuf.ByteString;

import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import ch.yourclick.kitt.ui.main.SectionsPagerAdapter;

public class MainActivity extends AppCompatActivity implements View.OnClickListener {

    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);
        SectionsPagerAdapter sectionsPagerAdapter = new SectionsPagerAdapter(this,null).show();
            }
        });
    }


    @RequiresApi(api = Build.VERSION_CODES.LOLLIPOP)
    @Override
    public void onClick(View view) {
        int SDK_INT = android.os.Build.VERSION.SDK_INT;
        if (SDK_INT > 8)
        {
            StrictMode.ThreadPolicy policy = new StrictMode.ThreadPolicy.Builder()
                    .permitAll().build();
            StrictMode.setThreadPolicy(policy);

            try {
                this.hello();
            } catch (Exception e) {
                e.printstacktrace();
            }

        }
    }

    /** Demonstrates using the Text-to-Speech API. */
    @RequiresApi(api = Build.VERSION_CODES.KITKAT)
    public void hello() throws Exception {
        InputStream stream = getResources().openRawResource(R.raw.credential); // R.raw.credential is credential.json
        GoogleCredentials credentials = GoogleCredentials.fromStream(stream);
        TextToSpeechSettings textToSpeechSettings =
                TextToSpeechSettings.newBuilder()
                        .setCredentialsProvider(
                                FixedCredentialsProvider.create(credentials)
                        ).build()
                ;

        // Instantiates a client
        try (TextToSpeechClient textToSpeechClient = TextToSpeechClient.create(textToSpeechSettings)) {

            // Set the text input to be synthesized
            SynthesisInput input = SynthesisInput.newBuilder().setText("Step 1 \n take a deep breath").build();

            // Build the voice request,audioConfig);

            // Get the audio contents from the response
            ByteString audioContents = response.getAudioContent();


            // Write the response to the output file.
            try (FileOutputStream out = new FileOutputStream(getFilesDir() + "/output.mp3")) {
                System.out.println(getFilesDir());
                out.write(audioContents.toByteArray());
                System.out.println("Audio content written to file \"output.mp3\"");
            }

            String myFile = getFilesDir() + "/output.mp3";
            MediaPlayer mediaPlayer = new MediaPlayer();
            mediaPlayer.setDataSource(myFile);
            mediaPlayer.prepare();
            mediaPlayer.start();

            if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
                textToSsml(getFilesDir() + "/output.mp3");
            }
        }
    }

    @RequiresApi(api = Build.VERSION_CODES.O)
    public static String textToSsml(String inputFile) throws Exception {

        // Read lines of input file
        String rawLines = new String(Files.readAllBytes(Paths.get(inputFile)));

        // Replace special characters with HTML Ampersand Character Codes
        // These codes prevent the API from confusing text with SSML tags
        // For example,'<' --> '&lt;' and '&' --> '&amp;'
        String escapedLines = HtmlEscapers.htmlEscaper().escape(rawLines);

        // Convert plaintext to SSML
        // Tag SSML so that there is a 2 second pause between each address
        String expandednewline = escapedLines.replaceAll("\\n","\n<break time='2s'/>");
        String ssml = "<speak>" + expandednewline + "</speak>";

        // Return the concatenated String of SSML
        return ssml;
    }

}

嗯，我期望的音频是：“第 1 步”（停顿 2 秒）“深呼吸”。但实际输出的却是“第 1 步深呼吸”，也就是说缺少了 2 秒的停顿。我做错了什么？

解决方法

您说：“它可能不起作用，因为我需要将该纯文本转换为 SSML。”

但这是不正确的。它已经“是”ssml，因为它包含 ssml 标签。

在您的原始代码中，您像这样定义输入：

// The question's original call: the markup is passed through setText(), not setSsml().
SynthesisInput input = SynthesisInput.newBuilder().setText("<speak>Step 1,take a deep breath. <break time=\"2000ms\"/> Hello?</speak>").build();

该字符串是“<speak>Step 1,take a deep breath. <break time="2000ms"/> Hello?</speak>”。

术语“纯文本”令人困惑。

字符串就是字符串，问题在于它应该被如何解释。

Google 需要知道是将此字符串解释为纯文本，还是其他“标记语言”，例如 ssml。

为了告诉 Google 您上传的字符串应该被解释为 ssml，您必须使用 setSsml 方法。

但是，您没有使用 setSsml 方法，因此 Google 没有将此 String 解释为 ssml。

试试这个：

String myString = "<speak>Step 1,take a deep breath. <break time=\"2000ms\"/> Hello?</speak>"

SynthesisInput input = SynthesisInput.newBuilder().setSsml(myString).build();

您也可以先说“第 1 步”，然后调用 Thread.sleep(2000);，再说“深呼吸”。

android android java java ssml text-to-speech