问题描述
我的目标是让设备用人声说出文本。所以我使用的是 Google 的 Text-to-Speech API。
这是我的代码的样子:
package ch.yourclick.kitt;
import android.media.MediaPlayer;
import android.os.Build;
import android.os.Bundle;
import android.os.StrictMode;
import android.view.View;
import androidx.annotation.RequiresApi;
import androidx.appcompat.app.AppCompatActivity;
import androidx.viewpager.widget.ViewPager;
import com.google.android.material.floatingactionbutton.FloatingActionButton;
import com.google.android.material.snackbar.Snackbar;
import com.google.android.material.tabs.TabLayout;
import com.google.api.gax.core.FixedCredentialsProvider;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.texttospeech.v1.AudioConfig;
import com.google.cloud.texttospeech.v1.AudioEncoding;
import com.google.cloud.texttospeech.v1.SsmlVoiceGender;
import com.google.cloud.texttospeech.v1.SynthesisInput;
import com.google.cloud.texttospeech.v1.SynthesizeSpeechResponse;
import com.google.cloud.texttospeech.v1.TextToSpeechClient;
import com.google.cloud.texttospeech.v1.TextToSpeechSettings;
import com.google.cloud.texttospeech.v1.VoiceSelectionParams;
import com.google.common.html.HtmlEscapers;
import com.google.protobuf.ByteString;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import ch.yourclick.kitt.ui.main.SectionsPagerAdapter;
/**
 * Main screen of the app: hosts the tabbed pager and, when {@link #onClick(View)} is
 * invoked, synthesizes a phrase with the Google Cloud Text-to-Speech API and plays
 * the resulting MP3.
 */
public class MainActivity extends AppCompatActivity implements View.OnClickListener {

    /**
     * Inflates the layout and wires up the view pager, the tab strip and the
     * floating action button.
     *
     * @param savedInstanceState state passed through to the superclass
     */
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        SectionsPagerAdapter sectionsPagerAdapter =
                new SectionsPagerAdapter(this, getSupportFragmentManager());
        ViewPager viewPager = findViewById(R.id.view_pager);
        viewPager.setAdapter(sectionsPagerAdapter);

        TabLayout tabs = findViewById(R.id.tabs);
        tabs.setupWithViewPager(viewPager);

        FloatingActionButton fab = findViewById(R.id.fab);
        fab.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                Snackbar.make(view, "Replace with your own action", Snackbar.LENGTH_LONG)
                        .setAction("Action", null).show();
            }
        });
    }

    /**
     * Installs a permit-all StrictMode thread policy and then runs {@link #hello()}.
     * Any exception thrown by {@code hello()} is printed and otherwise ignored.
     *
     * @param view the view that was clicked (unused)
     */
    @RequiresApi(api = Build.VERSION_CODES.LOLLIPOP)
    @Override
    public void onClick(View view) {
        int SDK_INT = android.os.Build.VERSION.SDK_INT;
        if (SDK_INT > 8) {
            // NOTE(review): presumably relaxed so the blocking call in hello() may run
            // on the calling thread; confirm before shipping.
            StrictMode.ThreadPolicy policy = new StrictMode.ThreadPolicy.Builder()
                    .permitAll().build();
            StrictMode.setThreadPolicy(policy);
            try {
                this.hello();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Demonstrates using the Text-to-Speech API: loads credentials from the raw
     * resource {@code R.raw.credential}, requests MP3 audio for a fixed string,
     * writes it to {@code output.mp3} under {@code getFilesDir()} and plays it.
     *
     * @throws Exception propagated from credential loading, the API client, the
     *     file write or the media player
     */
    @RequiresApi(api = Build.VERSION_CODES.KITKAT)
    public void hello() throws Exception {
        InputStream stream = getResources().openRawResource(R.raw.credential); // R.raw.credential is credential.json
        GoogleCredentials credentials = GoogleCredentials.fromStream(stream);
        TextToSpeechSettings textToSpeechSettings =
                TextToSpeechSettings.newBuilder()
                        .setCredentialsProvider(FixedCredentialsProvider.create(credentials))
                        .build();

        // Instantiates a client
        try (TextToSpeechClient textToSpeechClient = TextToSpeechClient.create(textToSpeechSettings)) {
            // Set the text input to be synthesized.
            // The string carries SSML markup but is supplied through setText.
            SynthesisInput input = SynthesisInput.newBuilder().setText("<speak>Step 1,take a deep breath. <break time=\"2000ms\"/> Hello?</speak>").build();

            // Build the voice request, select the language code ("en-US") and the ssml voice gender
            // ("neutral")
            VoiceSelectionParams voice =
                    VoiceSelectionParams.newBuilder()
                            .setLanguageCode("en-US")
                            .setSsmlGender(SsmlVoiceGender.NEUTRAL)
                            .build();

            // Select the type of audio file you want returned
            AudioConfig audioConfig =
                    AudioConfig.newBuilder().setAudioEncoding(AudioEncoding.MP3).build();

            // Perform the text-to-speech request on the text input with the selected voice parameters and
            // audio file type
            SynthesizeSpeechResponse response = textToSpeechClient.synthesizeSpeech(input, voice, audioConfig);

            // Get the audio contents from the response
            ByteString audioContents = response.getAudioContent();

            // Write the response to the output file.
            try (FileOutputStream out = new FileOutputStream(getFilesDir() + "/output.mp3")) {
                System.out.println(getFilesDir());
                out.write(audioContents.toByteArray());
                System.out.println("Audio content written to file \"output.mp3\"");
            }

            // Play back the file that was just written.
            String myFile = getFilesDir() + "/output.mp3";
            MediaPlayer mediaPlayer = new MediaPlayer();
            mediaPlayer.setDataSource(myFile);
            mediaPlayer.prepare();
            mediaPlayer.start();
        }
    }
}
正如您在代码中看到的,文本应该是“第 1 步,深呼吸。第 2 步......你好?你在吗?”
嗯,我收到了音频,但听起来不太自然:它以“less than speak……”开头(也就是把 <speak> 标签本身读了出来),这不是我想要的。
它可能不起作用,因为我需要将该纯文本转换为 SSML。但是,我该怎么做?
我使用的是 Android Studio。
更新
以下方法应该可以正常工作:
/**
 * Reads a plain-text file and wraps its contents in SSML, inserting a two-second
 * break after every newline.
 *
 * @param inputFile path of the plain-text file to read
 * @return the file contents, HTML-escaped and enclosed in {@code <speak>} tags
 * @throws Exception if the file cannot be read
 */
public static String textToSsml(String inputFile) throws Exception {
    // Read lines of input file; decode explicitly as UTF-8 rather than relying on
    // the platform default charset.
    String rawLines = new String(
            Files.readAllBytes(Paths.get(inputFile)),
            java.nio.charset.StandardCharsets.UTF_8);

    // Replace special characters with HTML Ampersand Character Codes
    // These codes prevent the API from confusing text with SSML tags
    // For example, '<' --> '&lt;' and '&' --> '&amp;'
    String escapedLines = HtmlEscapers.htmlEscaper().escape(rawLines);

    // Convert plaintext to SSML
    // Tag SSML so that there is a 2 second pause between each address
    String expandednewline = escapedLines.replaceAll("\\n","\n<break time='2s'/>");
    String ssml = "<speak>" + expandednewline + "</speak>";

    // Return the concatenated String of SSML
    return ssml;
}
参考:https://cloud.google.com/text-to-speech/docs/ssml-tutorial?hl=en#personalizing_synthetic_audio
我仍然不知道如何使用这种方法。但这就是我尝试过的:
package ch.yourclick.kitt;
import android.media.MediaPlayer;
import android.os.Build;
import android.os.Bundle;
import android.os.StrictMode;
import android.view.View;
import androidx.annotation.RequiresApi;
import androidx.appcompat.app.AppCompatActivity;
import androidx.viewpager.widget.ViewPager;
import com.google.android.material.floatingactionbutton.FloatingActionButton;
import com.google.android.material.snackbar.Snackbar;
import com.google.android.material.tabs.TabLayout;
import com.google.api.gax.core.FixedCredentialsProvider;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.texttospeech.v1.AudioConfig;
import com.google.cloud.texttospeech.v1.AudioEncoding;
import com.google.cloud.texttospeech.v1.SsmlVoiceGender;
import com.google.cloud.texttospeech.v1.SynthesisInput;
import com.google.cloud.texttospeech.v1.SynthesizeSpeechResponse;
import com.google.cloud.texttospeech.v1.TextToSpeechClient;
import com.google.cloud.texttospeech.v1.TextToSpeechSettings;
import com.google.cloud.texttospeech.v1.VoiceSelectionParams;
import com.google.common.html.HtmlEscapers;
import com.google.protobuf.ByteString;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import ch.yourclick.kitt.ui.main.SectionsPagerAdapter;
public class MainActivity extends AppCompatActivity implements View.OnClickListener {
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
SectionsPagerAdapter sectionsPagerAdapter = new SectionsPagerAdapter(this,null).show();
}
});
}
@RequiresApi(api = Build.VERSION_CODES.LOLLIPOP)
@Override
public void onClick(View view) {
int SDK_INT = android.os.Build.VERSION.SDK_INT;
if (SDK_INT > 8)
{
StrictMode.ThreadPolicy policy = new StrictMode.ThreadPolicy.Builder()
.permitAll().build();
StrictMode.setThreadPolicy(policy);
try {
this.hello();
} catch (Exception e) {
e.printstacktrace();
}
}
}
/** Demonstrates using the Text-to-Speech API. */
@RequiresApi(api = Build.VERSION_CODES.KITKAT)
public void hello() throws Exception {
InputStream stream = getResources().openRawResource(R.raw.credential); // R.raw.credential is credential.json
GoogleCredentials credentials = GoogleCredentials.fromStream(stream);
TextToSpeechSettings textToSpeechSettings =
TextToSpeechSettings.newBuilder()
.setCredentialsProvider(
FixedCredentialsProvider.create(credentials)
).build()
;
// Instantiates a client
try (TextToSpeechClient textToSpeechClient = TextToSpeechClient.create(textToSpeechSettings)) {
// Set the text input to be synthesized
SynthesisInput input = SynthesisInput.newBuilder().setText("Step 1 \n take a deep breath").build();
// Build the voice request,audioConfig);
// Get the audio contents from the response
ByteString audioContents = response.getAudioContent();
// Write the response to the output file.
try (FileOutputStream out = new FileOutputStream(getFilesDir() + "/output.mp3")) {
System.out.println(getFilesDir());
out.write(audioContents.toByteArray());
System.out.println("Audio content written to file \"output.mp3\"");
}
String myFile = getFilesDir() + "/output.mp3";
MediaPlayer mediaPlayer = new MediaPlayer();
mediaPlayer.setDataSource(myFile);
mediaPlayer.prepare();
mediaPlayer.start();
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
textToSsml(getFilesDir() + "/output.mp3");
}
}
}
@RequiresApi(api = Build.VERSION_CODES.O)
public static String textToSsml(String inputFile) throws Exception {
// Read lines of input file
String rawLines = new String(Files.readAllBytes(Paths.get(inputFile)));
// Replace special characters with HTML Ampersand Character Codes
// These codes prevent the API from confusing text with SSML tags
// For example,'<' --> '<' and '&' --> '&'
String escapedLines = HtmlEscapers.htmlEscaper().escape(rawLines);
// Convert plaintext to SSML
// Tag SSML so that there is a 2 second pause between each address
String expandednewline = escapedLines.replaceAll("\\n","\n<break time='2s'/>");
String ssml = "<speak>" + expandednewline + "</speak>";
// Return the concatenated String of SSML
return ssml;
}
}
嗯,目标是音频将是:“第 1 步”(等待 2 秒)“深呼吸” 但在我的情况下,输出是“第 1 步深呼吸”,因此缺少 2 秒的停顿。我做错了什么?
解决方法
您说:“这可能不起作用,因为我需要将该纯文本转换为 SSML。”
但这是不正确的。它已经“是”ssml,因为它包含 ssml 标签。
在您的原始代码中,您像这样定义输入:
// Quoted from the question's original code: the SSML-tagged string is handed to setText.
SynthesisInput input = SynthesisInput.newBuilder().setText("<speak>Step 1,take a deep breath. <break time=\"2000ms\"/> Hello?</speak>").build();
字符串是“<speak>Step 1,take a deep breath. <break time="2000ms"/> Hello?</speak>”。
术语“纯文本”令人困惑。
字符串就是字符串,问题在于它应当被如何解释。
Google 需要知道是将此字符串解释为纯文本,还是其他“标记语言”,例如 ssml。
为了告诉 Google 您上传的字符串应该被解释为 ssml,您必须使用 setSsml 方法。
但是,您没有使用 setSsml 方法,因此 Google 没有将此 String 解释为 ssml。
试试这个:
String myString = "<speak>Step 1,take a deep breath. <break time=\"2000ms\"/> Hello?</speak>"
SynthesisInput input = SynthesisInput.newBuilder().setSsml(myString).build();
另一种做法:
先让设备说出“第 1 步”,然后调用 Thread.sleep(2000); 等待 2 秒,
最后再说出“深呼吸”。