feat(speech): 新增语音转文字功能
新增 Deepgram 集成,支持音频文件上传、格式校验与转写;补充相关错误码并放行 /speech/transcribe 接口
This commit is contained in:
@@ -2,6 +2,6 @@
|
||||
"active": true,
|
||||
"started_at": "2026-01-26T13:01:18.447Z",
|
||||
"original_prompt": "刚刚回滚了代码,现在AI陪聊角色评论需要使用KeyboardAiCompanionCommentLikeService添加一个评论点赞接口,用来记录点赞和取消点赞。 ulw",
|
||||
"reinforcement_count": 4,
|
||||
"last_checked_at": "2026-01-26T13:55:34.306Z"
|
||||
"reinforcement_count": 5,
|
||||
"last_checked_at": "2026-01-27T05:14:53.054Z"
|
||||
}
|
||||
@@ -70,7 +70,11 @@ public enum ErrorCode {
|
||||
INVITE_CODE_ALREADY_BOUND(50028, "您已绑定过邀请码,无法重复绑定"),
|
||||
INVITE_CODE_CANNOT_BIND_SELF(50029, "不能绑定自己的邀请码"),
|
||||
RECEIPT_ALREADY_PROCESSED(50027, "收据已处理"),
|
||||
VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限,请开通会员");
|
||||
VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限,请开通会员"),
|
||||
AUDIO_FILE_EMPTY(40016, "音频文件不能为空"),
|
||||
AUDIO_FILE_TOO_LARGE(40017, "音频文件过大"),
|
||||
AUDIO_FORMAT_NOT_SUPPORTED(40018, "音频格式不支持"),
|
||||
STT_SERVICE_ERROR(50031, "语音转文字服务异常");
|
||||
|
||||
/**
|
||||
* 状态码
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
package com.yolo.keyborad.config;
|
||||
|
||||
import lombok.Data;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
/**
|
||||
* Deepgram STT 配置
|
||||
*
|
||||
* @author ziin
|
||||
*/
|
||||
@Data
|
||||
@Component
|
||||
@ConfigurationProperties(prefix = "deepgram")
|
||||
public class DeepgramProperties {
|
||||
|
||||
/** API Key */
|
||||
private String apiKey;
|
||||
|
||||
/** 基础 URL */
|
||||
private String baseUrl = "https://api.deepgram.com/v1";
|
||||
|
||||
/** 模型 ID */
|
||||
private String model = "nova-2";
|
||||
|
||||
/** 默认语言 */
|
||||
private String language = "en";
|
||||
|
||||
/** 智能格式化 */
|
||||
private Boolean smartFormat = true;
|
||||
|
||||
/** 添加标点符号 */
|
||||
private Boolean punctuate = true;
|
||||
}
|
||||
@@ -114,7 +114,8 @@ public class SaTokenConfigure implements WebMvcConfigurer {
|
||||
"/chat/audio/*",
|
||||
"/ai-companion/page",
|
||||
"/chat/history",
|
||||
"/ai-companion/comment/add"
|
||||
"/ai-companion/comment/add",
|
||||
"/speech/transcribe"
|
||||
};
|
||||
}
|
||||
@Bean
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
package com.yolo.keyborad.controller;
|
||||
|
||||
import com.yolo.keyborad.common.BaseResponse;
|
||||
import com.yolo.keyborad.common.ResultUtils;
|
||||
import com.yolo.keyborad.model.vo.SpeechToTextVO;
|
||||
import com.yolo.keyborad.service.DeepgramService;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import jakarta.annotation.Resource;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
/**
|
||||
* 语音服务控制器
|
||||
*
|
||||
* @author ziin
|
||||
*/
|
||||
@RestController
|
||||
@Slf4j
|
||||
@RequestMapping("/speech")
|
||||
@Tag(name = "语音服务", description = "语音相关功能接口")
|
||||
public class SpeechController {
|
||||
|
||||
@Resource
|
||||
private DeepgramService deepgramService;
|
||||
|
||||
@PostMapping("/transcribe")
|
||||
@Operation(summary = "语音转文字", description = "上传音频文件并转换为文本")
|
||||
public BaseResponse<SpeechToTextVO> transcribe(@RequestPart("file") MultipartFile file) {
|
||||
SpeechToTextVO result = deepgramService.transcribe(file);
|
||||
return ResultUtils.success(result);
|
||||
}
|
||||
}
|
||||
32
src/main/java/com/yolo/keyborad/model/vo/SpeechToTextVO.java
Normal file
32
src/main/java/com/yolo/keyborad/model/vo/SpeechToTextVO.java
Normal file
@@ -0,0 +1,32 @@
|
||||
package com.yolo.keyborad.model.vo;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
|
||||
* 语音转文字响应VO
|
||||
*
|
||||
* @author ziin
|
||||
*/
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "语音转文字响应")
|
||||
public class SpeechToTextVO {
|
||||
|
||||
@Schema(description = "转录文本")
|
||||
private String transcript;
|
||||
|
||||
@Schema(description = "置信度")
|
||||
private Double confidence;
|
||||
|
||||
@Schema(description = "音频时长(秒)")
|
||||
private Double duration;
|
||||
|
||||
@Schema(description = "检测到的语言")
|
||||
private String detectedLanguage;
|
||||
}
|
||||
29
src/main/java/com/yolo/keyborad/service/DeepgramService.java
Normal file
29
src/main/java/com/yolo/keyborad/service/DeepgramService.java
Normal file
@@ -0,0 +1,29 @@
|
||||
package com.yolo.keyborad.service;
|
||||
|
||||
import com.yolo.keyborad.model.vo.SpeechToTextVO;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
/**
|
||||
* Deepgram STT 语音转文字服务接口
|
||||
*
|
||||
* @author ziin
|
||||
*/
|
||||
public interface DeepgramService {
|
||||
|
||||
/**
|
||||
* 将音频文件转换为文字(使用默认语言)
|
||||
*
|
||||
* @param audioFile 音频文件
|
||||
* @return 语音转文字结果
|
||||
*/
|
||||
SpeechToTextVO transcribe(MultipartFile audioFile);
|
||||
|
||||
/**
|
||||
* 将音频文件转换为文字(指定语言)
|
||||
*
|
||||
* @param audioFile 音频文件
|
||||
* @param language 语言代码(如 en, zh, ja 等)
|
||||
* @return 语音转文字结果
|
||||
*/
|
||||
SpeechToTextVO transcribe(MultipartFile audioFile, String language);
|
||||
}
|
||||
@@ -0,0 +1,182 @@
|
||||
package com.yolo.keyborad.service.impl;
|
||||
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
import com.alibaba.fastjson.JSONArray;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.yolo.keyborad.common.ErrorCode;
|
||||
import com.yolo.keyborad.config.DeepgramProperties;
|
||||
import com.yolo.keyborad.exception.BusinessException;
|
||||
import com.yolo.keyborad.model.vo.SpeechToTextVO;
|
||||
import com.yolo.keyborad.service.DeepgramService;
|
||||
import jakarta.annotation.Resource;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.client.RestClient;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Deepgram STT 语音转文字服务实现
|
||||
* 参考: https://developers.deepgram.com/docs/getting-started-with-pre-recorded-audio
|
||||
*
|
||||
* @author ziin
|
||||
*/
|
||||
@Service
|
||||
@Slf4j
|
||||
public class DeepgramServiceImpl implements DeepgramService {
|
||||
|
||||
@Resource
|
||||
private DeepgramProperties deepgramProperties;
|
||||
|
||||
@Resource
|
||||
private RestClient restClient;
|
||||
|
||||
// 支持的音频MIME类型
|
||||
private static final List<String> ALLOWED_AUDIO_TYPES = Arrays.asList(
|
||||
"audio/wav", "audio/wave",
|
||||
"audio/mp3", "audio/mpeg",
|
||||
"audio/webm",
|
||||
"audio/ogg",
|
||||
"audio/flac",
|
||||
"audio/m4a"
|
||||
);
|
||||
|
||||
// 最大文件大小:20MB
|
||||
private static final long MAX_FILE_SIZE = 20 * 1024 * 1024;
|
||||
|
||||
@Override
|
||||
public SpeechToTextVO transcribe(MultipartFile audioFile) {
|
||||
return transcribe(audioFile, deepgramProperties.getLanguage());
|
||||
}
|
||||
|
||||
@Override
|
||||
public SpeechToTextVO transcribe(MultipartFile audioFile, String language) {
|
||||
// 1. 参数校验
|
||||
validateAudioFile(audioFile);
|
||||
|
||||
if (StrUtil.isBlank(language)) {
|
||||
language = deepgramProperties.getLanguage();
|
||||
}
|
||||
|
||||
// 2. 获取音频Content-Type
|
||||
String contentType = audioFile.getContentType();
|
||||
if (StrUtil.isBlank(contentType) || !ALLOWED_AUDIO_TYPES.contains(contentType)) {
|
||||
log.warn("不支持的音频格式: {}", contentType);
|
||||
throw new BusinessException(ErrorCode.AUDIO_FORMAT_NOT_SUPPORTED);
|
||||
}
|
||||
|
||||
// 3. 构建请求URL
|
||||
String requestUrl = buildRequestUrl(language);
|
||||
|
||||
log.info("调用 Deepgram STT API, language: {}, contentType: {}, 文件大小: {} bytes",
|
||||
language, contentType, audioFile.getSize());
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
try {
|
||||
// 4. 发送请求
|
||||
byte[] audioBytes = audioFile.getBytes();
|
||||
|
||||
String responseJson = restClient.post()
|
||||
.uri(requestUrl)
|
||||
.contentType(MediaType.parseMediaType(contentType))
|
||||
.header("Authorization", "Token " + deepgramProperties.getApiKey())
|
||||
.body(audioBytes)
|
||||
.retrieve()
|
||||
.body(String.class);
|
||||
|
||||
long duration = System.currentTimeMillis() - startTime;
|
||||
log.info("Deepgram STT API 响应成功, 耗时: {}ms", duration);
|
||||
|
||||
// 5. 解析响应
|
||||
return parseResponse(responseJson);
|
||||
|
||||
} catch (IOException e) {
|
||||
log.error("读取音频文件失败", e);
|
||||
throw new BusinessException(ErrorCode.SYSTEM_ERROR, "音频文件读取失败: " + e.getMessage());
|
||||
} catch (Exception e) {
|
||||
log.error("调用 Deepgram STT API 发生异常", e);
|
||||
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "语音转文字服务异常: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 校验音频文件
|
||||
*/
|
||||
private void validateAudioFile(MultipartFile audioFile) {
|
||||
if (audioFile == null || audioFile.isEmpty()) {
|
||||
throw new BusinessException(ErrorCode.AUDIO_FILE_EMPTY);
|
||||
}
|
||||
|
||||
if (audioFile.getSize() > MAX_FILE_SIZE) {
|
||||
throw new BusinessException(ErrorCode.AUDIO_FILE_TOO_LARGE);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 构建请求URL
|
||||
*/
|
||||
private String buildRequestUrl(String language) {
|
||||
StringBuilder url = new StringBuilder(deepgramProperties.getBaseUrl());
|
||||
url.append("/listen");
|
||||
|
||||
// 添加查询参数
|
||||
url.append("?model=").append(deepgramProperties.getModel());
|
||||
url.append("&language=").append(language);
|
||||
|
||||
if (deepgramProperties.getSmartFormat()) {
|
||||
url.append("&smart_format=true");
|
||||
}
|
||||
if (deepgramProperties.getPunctuate()) {
|
||||
url.append("&punctuate=true");
|
||||
}
|
||||
|
||||
return url.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析响应JSON
|
||||
*/
|
||||
private SpeechToTextVO parseResponse(String responseJson) {
|
||||
JSONObject jsonResponse = JSONObject.parseObject(responseJson);
|
||||
|
||||
// 解析 metadata
|
||||
JSONObject metadata = jsonResponse.getJSONObject("metadata");
|
||||
Double duration = metadata != null ? metadata.getDouble("duration") : null;
|
||||
|
||||
// 解析 results
|
||||
JSONObject results = jsonResponse.getJSONObject("results");
|
||||
if (results == null) {
|
||||
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 results");
|
||||
}
|
||||
|
||||
JSONArray channels = results.getJSONArray("channels");
|
||||
if (channels == null || channels.isEmpty()) {
|
||||
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 channels");
|
||||
}
|
||||
|
||||
JSONObject channel = channels.getJSONObject(0);
|
||||
JSONArray alternatives = channel.getJSONArray("alternatives");
|
||||
if (alternatives == null || alternatives.isEmpty()) {
|
||||
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 alternatives");
|
||||
}
|
||||
|
||||
JSONObject alternative = alternatives.getJSONObject(0);
|
||||
String transcript = alternative.getString("transcript");
|
||||
Double confidence = alternative.getDouble("confidence");
|
||||
String detectedLanguage = channel.getString("detected_language");
|
||||
|
||||
log.info("转录成功, 文本长度: {}, 置信度: {}, 检测语言: {}",
|
||||
transcript != null ? transcript.length() : 0, confidence, detectedLanguage);
|
||||
|
||||
return SpeechToTextVO.builder()
|
||||
.transcript(transcript)
|
||||
.confidence(confidence)
|
||||
.duration(duration)
|
||||
.detectedLanguage(detectedLanguage)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
@@ -105,4 +105,11 @@ elevenlabs:
|
||||
api-key: sk_25339d32bb14c91f460ed9fce83a1951672f07846a7a10ce
|
||||
voice-id: JBFqnCBsd6RMkjVDRZzb
|
||||
model-id: eleven_turbo_v2_5
|
||||
output-format: mp3_44100_128
|
||||
output-format: mp3_44100_128
|
||||
|
||||
deepgram:
  # Secrets must not live in version control: the key is read from the
  # DEEPGRAM_API_KEY environment variable. Any key that was ever committed
  # in this file should be treated as leaked and rotated.
  api-key: ${DEEPGRAM_API_KEY}
  model: nova-2
  language: en
  smart-format: true
  punctuate: true
|
||||
Reference in New Issue
Block a user