feat(speech): 新增语音转文字功能

新增 Deepgram 集成,支持音频文件上传、格式校验与转写;补充相关错误码并放行 /speech/transcribe 接口
This commit is contained in:
2026-01-27 18:17:36 +08:00
parent f18217ba93
commit 6cf0275980
9 changed files with 328 additions and 5 deletions

View File

@@ -2,6 +2,6 @@
"active": true,
"started_at": "2026-01-26T13:01:18.447Z",
"original_prompt": "刚刚回滚了代码现在AI陪聊角色评论需要使用KeyboardAiCompanionCommentLikeService添加一个评论点赞接口用来记录点赞和取消点赞。 ulw",
"reinforcement_count": 4,
"last_checked_at": "2026-01-26T13:55:34.306Z"
"reinforcement_count": 5,
"last_checked_at": "2026-01-27T05:14:53.054Z"
}

View File

@@ -70,7 +70,11 @@ public enum ErrorCode {
INVITE_CODE_ALREADY_BOUND(50028, "您已绑定过邀请码,无法重复绑定"),
INVITE_CODE_CANNOT_BIND_SELF(50029, "不能绑定自己的邀请码"),
RECEIPT_ALREADY_PROCESSED(50027, "收据已处理"),
VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限,请开通会员");
VIP_TRIAL_LIMIT_REACHED(50030, "今日体验次数已达上限,请开通会员"),
AUDIO_FILE_EMPTY(40016, "音频文件不能为空"),
AUDIO_FILE_TOO_LARGE(40017, "音频文件过大"),
AUDIO_FORMAT_NOT_SUPPORTED(40018, "音频格式不支持"),
STT_SERVICE_ERROR(50031, "语音转文字服务异常");
/**
* 状态码

View File

@@ -0,0 +1,34 @@
package com.yolo.keyborad.config;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
/**
 * Configuration properties for the Deepgram speech-to-text (STT) integration.
 *
 * <p>Values are bound from configuration keys under the {@code deepgram} prefix;
 * every field except {@code apiKey} carries a built-in default.
 *
 * @author ziin
 */
@Component
@ConfigurationProperties(prefix = "deepgram")
@Data
public class DeepgramProperties {

    /** API key used to authenticate against Deepgram; has no default. */
    private String apiKey;

    /** Base URL of the Deepgram REST API. */
    private String baseUrl = "https://api.deepgram.com/v1";

    /** Identifier of the transcription model. */
    private String model = "nova-2";

    /** Language code used when the caller does not supply one. */
    private String language = "en";

    /** Whether to request smart formatting; enabled by default. */
    private Boolean smartFormat = Boolean.TRUE;

    /** Whether to request punctuation in the transcript; enabled by default. */
    private Boolean punctuate = Boolean.TRUE;
}

View File

@@ -114,7 +114,8 @@ public class SaTokenConfigure implements WebMvcConfigurer {
"/chat/audio/*",
"/ai-companion/page",
"/chat/history",
"/ai-companion/comment/add"
"/ai-companion/comment/add",
"/speech/transcribe"
};
}
@Bean

View File

@@ -0,0 +1,34 @@
package com.yolo.keyborad.controller;
import com.yolo.keyborad.common.BaseResponse;
import com.yolo.keyborad.common.ResultUtils;
import com.yolo.keyborad.model.vo.SpeechToTextVO;
import com.yolo.keyborad.service.DeepgramService;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
/**
 * REST controller for speech-related endpoints, mounted under {@code /speech}.
 *
 * @author ziin
 */
@Slf4j
@RestController
@RequestMapping("/speech")
@Tag(name = "语音服务", description = "语音相关功能接口")
public class SpeechController {

    @Resource
    private DeepgramService deepgramService;

    /**
     * Transcribes an uploaded audio file to text.
     *
     * <p>Delegates to {@link DeepgramService#transcribe(MultipartFile)} and wraps the
     * outcome in a success response.
     *
     * @param file the multipart part named {@code file} carrying the audio
     * @return a success response containing the transcription result
     */
    @Operation(summary = "语音转文字", description = "上传音频文件并转换为文本")
    @PostMapping("/transcribe")
    public BaseResponse<SpeechToTextVO> transcribe(@RequestPart("file") MultipartFile file) {
        return ResultUtils.success(deepgramService.transcribe(file));
    }
}

View File

@@ -0,0 +1,32 @@
package com.yolo.keyborad.model.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Response view object for a speech-to-text request.
 *
 * @author ziin
 */
@Schema(description = "语音转文字响应")
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class SpeechToTextVO {

    /** Transcribed text. */
    @Schema(description = "转录文本")
    private String transcript;

    /** Confidence score reported for the transcript. */
    @Schema(description = "置信度")
    private Double confidence;

    /** Duration of the audio, in seconds. */
    @Schema(description = "音频时长(秒)")
    private Double duration;

    /** Language detected in the audio. */
    @Schema(description = "检测到的语言")
    private String detectedLanguage;
}

View File

@@ -0,0 +1,29 @@
package com.yolo.keyborad.service;
import com.yolo.keyborad.model.vo.SpeechToTextVO;
import org.springframework.web.multipart.MultipartFile;
/**
 * Service contract for Deepgram speech-to-text (STT) transcription.
 *
 * @author ziin
 */
public interface DeepgramService {

    /**
     * Transcribes an audio file using the default language.
     *
     * @param audioFile the audio file to transcribe
     * @return the transcription result
     */
    SpeechToTextVO transcribe(MultipartFile audioFile);

    /**
     * Transcribes an audio file in an explicitly specified language.
     *
     * @param audioFile the audio file to transcribe
     * @param language  language code, e.g. {@code en}, {@code zh}, {@code ja}
     * @return the transcription result
     */
    SpeechToTextVO transcribe(MultipartFile audioFile, String language);
}

View File

@@ -0,0 +1,182 @@
package com.yolo.keyborad.service.impl;
import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.yolo.keyborad.common.ErrorCode;
import com.yolo.keyborad.config.DeepgramProperties;
import com.yolo.keyborad.exception.BusinessException;
import com.yolo.keyborad.model.vo.SpeechToTextVO;
import com.yolo.keyborad.service.DeepgramService;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.http.MediaType;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestClient;
import org.springframework.web.multipart.MultipartFile;
import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Set;
/**
* Deepgram STT 语音转文字服务实现
* 参考: https://developers.deepgram.com/docs/getting-started-with-pre-recorded-audio
*
* @author ziin
*/
@Service
@Slf4j
public class DeepgramServiceImpl implements DeepgramService {
@Resource
private DeepgramProperties deepgramProperties;
@Resource
private RestClient restClient;
// 支持的音频MIME类型
private static final List<String> ALLOWED_AUDIO_TYPES = Arrays.asList(
"audio/wav", "audio/wave",
"audio/mp3", "audio/mpeg",
"audio/webm",
"audio/ogg",
"audio/flac",
"audio/m4a"
);
// 最大文件大小20MB
private static final long MAX_FILE_SIZE = 20 * 1024 * 1024;
@Override
public SpeechToTextVO transcribe(MultipartFile audioFile) {
return transcribe(audioFile, deepgramProperties.getLanguage());
}
@Override
public SpeechToTextVO transcribe(MultipartFile audioFile, String language) {
// 1. 参数校验
validateAudioFile(audioFile);
if (StrUtil.isBlank(language)) {
language = deepgramProperties.getLanguage();
}
// 2. 获取音频Content-Type
String contentType = audioFile.getContentType();
if (StrUtil.isBlank(contentType) || !ALLOWED_AUDIO_TYPES.contains(contentType)) {
log.warn("不支持的音频格式: {}", contentType);
throw new BusinessException(ErrorCode.AUDIO_FORMAT_NOT_SUPPORTED);
}
// 3. 构建请求URL
String requestUrl = buildRequestUrl(language);
log.info("调用 Deepgram STT API, language: {}, contentType: {}, 文件大小: {} bytes",
language, contentType, audioFile.getSize());
long startTime = System.currentTimeMillis();
try {
// 4. 发送请求
byte[] audioBytes = audioFile.getBytes();
String responseJson = restClient.post()
.uri(requestUrl)
.contentType(MediaType.parseMediaType(contentType))
.header("Authorization", "Token " + deepgramProperties.getApiKey())
.body(audioBytes)
.retrieve()
.body(String.class);
long duration = System.currentTimeMillis() - startTime;
log.info("Deepgram STT API 响应成功, 耗时: {}ms", duration);
// 5. 解析响应
return parseResponse(responseJson);
} catch (IOException e) {
log.error("读取音频文件失败", e);
throw new BusinessException(ErrorCode.SYSTEM_ERROR, "音频文件读取失败: " + e.getMessage());
} catch (Exception e) {
log.error("调用 Deepgram STT API 发生异常", e);
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "语音转文字服务异常: " + e.getMessage());
}
}
/**
* 校验音频文件
*/
private void validateAudioFile(MultipartFile audioFile) {
if (audioFile == null || audioFile.isEmpty()) {
throw new BusinessException(ErrorCode.AUDIO_FILE_EMPTY);
}
if (audioFile.getSize() > MAX_FILE_SIZE) {
throw new BusinessException(ErrorCode.AUDIO_FILE_TOO_LARGE);
}
}
/**
* 构建请求URL
*/
private String buildRequestUrl(String language) {
StringBuilder url = new StringBuilder(deepgramProperties.getBaseUrl());
url.append("/listen");
// 添加查询参数
url.append("?model=").append(deepgramProperties.getModel());
url.append("&language=").append(language);
if (deepgramProperties.getSmartFormat()) {
url.append("&smart_format=true");
}
if (deepgramProperties.getPunctuate()) {
url.append("&punctuate=true");
}
return url.toString();
}
/**
* 解析响应JSON
*/
private SpeechToTextVO parseResponse(String responseJson) {
JSONObject jsonResponse = JSONObject.parseObject(responseJson);
// 解析 metadata
JSONObject metadata = jsonResponse.getJSONObject("metadata");
Double duration = metadata != null ? metadata.getDouble("duration") : null;
// 解析 results
JSONObject results = jsonResponse.getJSONObject("results");
if (results == null) {
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 results");
}
JSONArray channels = results.getJSONArray("channels");
if (channels == null || channels.isEmpty()) {
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 channels");
}
JSONObject channel = channels.getJSONObject(0);
JSONArray alternatives = channel.getJSONArray("alternatives");
if (alternatives == null || alternatives.isEmpty()) {
throw new BusinessException(ErrorCode.STT_SERVICE_ERROR, "响应格式错误: 缺少 alternatives");
}
JSONObject alternative = alternatives.getJSONObject(0);
String transcript = alternative.getString("transcript");
Double confidence = alternative.getDouble("confidence");
String detectedLanguage = channel.getString("detected_language");
log.info("转录成功, 文本长度: {}, 置信度: {}, 检测语言: {}",
transcript != null ? transcript.length() : 0, confidence, detectedLanguage);
return SpeechToTextVO.builder()
.transcript(transcript)
.confidence(confidence)
.duration(duration)
.detectedLanguage(detectedLanguage)
.build();
}
}

View File

@@ -106,3 +106,10 @@ elevenlabs:
voice-id: JBFqnCBsd6RMkjVDRZzb
model-id: eleven_turbo_v2_5
output-format: mp3_44100_128
deepgram:
  # Secret: injected from the DEEPGRAM_API_KEY environment variable; never commit the real key.
  # NOTE(review): the key previously committed here is exposed in version control history and should be rotated.
  api-key: ${DEEPGRAM_API_KEY}
  model: nova-2
  language: en
  smart-format: true
  punctuate: true