The audio file transcription service converts an audio file of up to five hours into punctuated text and automatically segments the text for easier reading. It can also generate text with timestamps, which facilitates subsequent feature development. In the current version, both Chinese and English are supported. The supported audio inputs are as follows:
Duration | Size | Sampling Rate | Channel | Sampling Precision | Audio File Format and Encoding Format
---|---|---|---|---|---
Less than 1 minute | Less than 4 MB after Base64 encoding | 8 kHz–48 kHz | Mono | 16 bits | |
Less than 5 hours | 300 MB | 8 kHz–48 kHz | Mono | 16 bits | |
Language support varies by country/region:

Country/Region | Mandarin Chinese | English
---|---|---
Europe | √ | √
Russia | - | √
Asia, Africa, and Latin America | √ | √
China | √ | √
The service is widely used in everyday life and work. When it is inconvenient to listen to a voice message, a user can convert the message into text with this service. Because transcription with timestamps is supported, the service also suits video subtitle creation: the audio in a video clip can be converted into text with each segment marked by a timestamp, which greatly improves the efficiency of subtitle production, as the sketch below shows.
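As an illustration of the subtitle use case, the sentence-level segments returned by the service map naturally onto the SubRip (SRT) format. The following helper is a minimal sketch, not part of the SDK; it assumes that `Segment.getStartTime()` and `Segment.getEndTime()` return millisecond offsets from the start of the audio (consistent with how they are logged in the listener code later in this section), and that the result classes live in the `com.huawei.hms.mlsdk.aft.cloud` package.

```java
import java.util.List;
import java.util.Locale;

import com.huawei.hms.mlsdk.aft.cloud.MLRemoteAftResult;

// Illustrative helper (not part of the SDK): convert sentence segments into SRT subtitle text.
public final class SrtFormatter {
    public static String toSrt(List<MLRemoteAftResult.Segment> sentences) {
        StringBuilder srt = new StringBuilder();
        int index = 1;
        for (MLRemoteAftResult.Segment sentence : sentences) {
            srt.append(index++).append('\n')
                    .append(formatTime(sentence.getStartTime()))
                    .append(" --> ")
                    .append(formatTime(sentence.getEndTime()))
                    .append('\n')
                    .append(sentence.getText())
                    .append("\n\n");
        }
        return srt.toString();
    }

    // Format a millisecond offset as the SRT timestamp HH:MM:SS,mmm.
    private static String formatTime(long ms) {
        return String.format(Locale.US, "%02d:%02d:%02d,%03d",
                ms / 3_600_000, (ms / 60_000) % 60, (ms / 1000) % 60, ms % 1000);
    }
}
```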
Before developing with the API, you need to complete the necessary preparations: ensure that the Maven repository address of the HMS Core SDK has been configured in your project and that the SDK of this service has been integrated.
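In practice, this means adding Huawei's Maven repository and the transcription SDK dependency to your Gradle build files. The snippet below is a sketch: the repository URL is the documented HMS one, but the artifact coordinates and version number are assumptions to verify against the official release notes.

```groovy
// Project-level build.gradle: add Huawei's Maven repository.
allprojects {
    repositories {
        google()
        mavenCentral()
        maven { url 'https://developer.huawei.com/repo/' }
    }
}

// App-level build.gradle: integrate the audio file transcription SDK.
// The artifact name and version below are illustrative; use the latest documented values.
dependencies {
    implementation 'com.huawei.hms:ml-computer-voice-aft:2.2.0.300'
}
```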
Then create and configure the transcription engine:

```java
// Create an audio file transcription engine.
MLRemoteAftEngine engine = MLRemoteAftEngine.getInstance();
// Initialize the engine and pass the current context.
engine.init(this);
// Create an audio file transcription configurator.
MLRemoteAftSetting setting = new MLRemoteAftSetting.Factory()
        // Set the transcription language code, which complies with the BCP 47 standard.
        // Currently, Mandarin Chinese and English are supported.
        .setLanguageCode("zh")
        // Set whether to generate the text transcription result of each audio segment and the
        // corresponding audio time shift. The default value is false. (This parameter needs to be
        // set only when the audio duration is less than 1 minute.)
        // true: Return the text transcription result of the audio file, as well as the text
        //       transcription result of each audio segment and the corresponding time shift.
        //       This applies to short audio files with a duration of 1 minute or less.
        // false: Return only the text transcription result of the audio file.
        .enableWordTimeOffset(true)
        // Set whether to output the time shift of each sentence in the audio file. The default value is false.
        // true: Return the text transcription result of the audio file, as well as the time shift of each sentence.
        // false: Return only the text transcription result of the audio file.
        .enableSentenceTimeOffset(true)
        // Set whether to automatically add punctuation to the converted text. The default value is false.
        // true: Punctuation is automatically added to the converted text.
        // false: Punctuation is not added to the converted text.
        .enablePunctuation(true)
        .create();
Create a transcription result listener. For short audio files, the following callback is sufficient:

```java
private MLRemoteAftListener aftListener = new MLRemoteAftListener() {
    @Override
    public void onResult(String taskId, MLRemoteAftResult result, Object ext) {
        // Obtain the transcription result notification.
        if (result.isComplete()) {
            // Process the transcription result.
        }
    }

    @Override
    public void onError(String taskId, int errorCode, String message) {
        // Callback invoked when a transcription error occurs.
    }

    @Override
    public void onInitComplete(String taskId, Object ext) {
        // Reserved.
    }

    @Override
    public void onUploadProgress(String taskId, double progress, Object ext) {
        // Reserved.
    }

    @Override
    public void onEvent(String taskId, int eventId, Object ext) {
        // Reserved.
    }
};
```
For long audio files, the listener also needs to start the task after initialization, react to the upload-completed event, and periodically poll for the result:

```java
private MLRemoteAftListener asrListener = new MLRemoteAftListener() {
    @Override
    public void onInitComplete(String taskId, Object ext) {
        Log.e(TAG, "MLAsrCallBack onInitComplete");
        // The long audio file has been initialized; start the transcription.
        start(taskId);
    }

    @Override
    public void onUploadProgress(String taskId, double progress, Object ext) {
        Log.e(TAG, "MLAsrCallBack onUploadProgress");
    }

    @Override
    public void onEvent(String taskId, int eventId, Object ext) {
        // Used for long audio files.
        Log.e(TAG, "MLAsrCallBack onEvent " + eventId);
        if (MLAftEvents.UPLOADED_EVENT == eventId) {
            // The file has been uploaded successfully; obtain the transcription result.
            startQueryResult(taskId);
        }
    }

    @Override
    public void onResult(String taskId, MLRemoteAftResult result, Object ext) {
        Log.e(TAG, "MLAsrCallBack onResult taskId is : " + taskId);
        if (result != null) {
            Log.e(TAG, "MLAsrCallBack onResult isComplete: " + result.isComplete());
            if (result.isComplete()) {
                // The task is finished; stop polling for this task ID.
                TimerTask timerTask = timerTaskMap.get(taskId);
                if (null != timerTask) {
                    timerTask.cancel();
                    timerTaskMap.remove(taskId);
                }
                if (result.getText() != null) {
                    Log.e(TAG, taskId + " MLAsrCallBack onResult result is : " + result.getText());
                    tvText.setText(result.getText());
                }
                List<MLRemoteAftResult.Segment> words = result.getWords();
                if (words != null && words.size() != 0) {
                    for (MLRemoteAftResult.Segment word : words) {
                        Log.e(TAG, "MLAsrCallBack word text is : " + word.getText()
                                + ", startTime is : " + word.getStartTime()
                                + ", endTime is : " + word.getEndTime());
                    }
                }
                List<MLRemoteAftResult.Segment> sentences = result.getSentences();
                if (sentences != null && sentences.size() != 0) {
                    for (MLRemoteAftResult.Segment sentence : sentences) {
                        Log.e(TAG, "MLAsrCallBack sentence text is : " + sentence.getText()
                                + ", startTime is : " + sentence.getStartTime()
                                + ", endTime is : " + sentence.getEndTime());
                    }
                }
            }
        }
    }

    @Override
    public void onError(String taskId, int errorCode, String message) {
        Log.i(TAG, "MLAsrCallBack onError : " + message + ", errorCode: " + errorCode);
        switch (errorCode) {
            case MLAftErrors.ERR_AUDIO_FILE_NOTSUPPORTED:
                // Handle the unsupported audio file format error.
                break;
        }
    }
};

// Upload a transcription task.
private void start(String taskId) {
    Log.e(TAG, "start");
    engine.setAftListener(asrListener);
    engine.startTask(taskId);
}

// Obtain the transcription result.
private Map<String, TimerTask> timerTaskMap = new HashMap<>();

private void startQueryResult(final String taskId) {
    Timer mTimer = new Timer();
    TimerTask mTimerTask = new TimerTask() {
        @Override
        public void run() {
            getResult(taskId);
        }
    };
    // Query the long audio file transcription result every 10 seconds, starting after 5 seconds.
    mTimer.schedule(mTimerTask, 5000, 10000);
    // Clear timerTaskMap before destroying the UI.
    timerTaskMap.put(taskId, mTimerTask);
}
```
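The `getResult` method invoked by the timer task is not shown above. A minimal sketch, assuming `getLongAftResult` is the engine's query API (the result then arrives asynchronously through the `onResult` callback of `asrListener`):

```java
// Query the transcription result of a long audio task.
// getLongAftResult is assumed here as the query API; verify it against the SDK reference.
private void getResult(String taskId) {
    Log.e(TAG, "getResult");
    engine.setAftListener(asrListener);
    engine.getLongAftResult(taskId);
}
```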
Before submitting a task, bind the listener to the engine (pass `aftListener` for short audio files, or `asrListener` for long ones):

```java
engine.setAftListener(aftListener);
```
Then start the transcription:

```java
// uri indicates the audio resource read from the local storage or the recorder. Only local audio
// files with a duration of 1 minute or less and a URI starting with content:// or file:// are supported.
engine.shortRecognize(uri, setting);
// longRecognize is the API used to convert audio files with a duration ranging from 1 minute to 5 hours.
engine.longRecognize(uri, setting);
```
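Each call returns a task ID that identifies the job in every listener callback. A short sketch, assuming a `String` return type and that `destroyTask` is the cancellation API (both are assumptions to verify against the SDK reference):

```java
// Keep the returned task ID so the job can be tracked in callbacks and cancelled if needed.
String taskId = engine.shortRecognize(uri, setting);

// Later, e.g. when the user leaves the screen, cancel the ongoing task.
// destroyTask is assumed here as the cancellation API.
engine.destroyTask(taskId);
```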
To query the languages supported by short audio file transcription:

```java
MLRemoteAftEngine.getInstance().getShortAftLanguages(new MLRemoteAftEngine.LanguageCallback() {
    @Override
    public void onResult(final List<String> result) {
        Log.i(TAG, "support languages == " + result.toString());
    }

    @Override
    public void onError(int errorCode, String errorMsg) {
        Log.e(TAG, "errorCode: " + errorCode + ", errorMsg: " + errorMsg);
    }
});
```
To query the languages supported by long audio file transcription:

```java
MLRemoteAftEngine.getInstance().getLongAftLanguages(new MLRemoteAftEngine.LanguageCallback() {
    @Override
    public void onResult(final List<String> result) {
        Log.i(TAG, "support languages == " + result.toString());
    }

    @Override
    public void onError(int errorCode, String errorMsg) {
        Log.e(TAG, "errorCode: " + errorCode + ", errorMsg: " + errorMsg);
    }
});
```