diff --git a/keyBoard.xcodeproj/project.pbxproj b/keyBoard.xcodeproj/project.pbxproj index bdcf1bb..f388528 100644 --- a/keyBoard.xcodeproj/project.pbxproj +++ b/keyBoard.xcodeproj/project.pbxproj @@ -198,6 +198,10 @@ 04E038D82F20BFFB002CA5A0 /* websocket-api.md in Resources */ = {isa = PBXBuildFile; fileRef = 04E038D72F20BFFB002CA5A0 /* websocket-api.md */; }; 04E038DD2F20C420002CA5A0 /* VoiceChatStreamingManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */; }; 04E038DE2F20C420002CA5A0 /* VoiceChatWebSocketClient.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */; }; + 04E038E32F20E500002CA5A0 /* deepgramAPI.md in Resources */ = {isa = PBXBuildFile; fileRef = 04E038E22F20E500002CA5A0 /* deepgramAPI.md */; }; + 04E038E82F20E877002CA5A0 /* DeepgramWebSocketClient.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038E72F20E877002CA5A0 /* DeepgramWebSocketClient.m */; }; + 04E038E92F20E877002CA5A0 /* DeepgramStreamingManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038E52F20E877002CA5A0 /* DeepgramStreamingManager.m */; }; + 04E038EF2F21F0EC002CA5A0 /* AiVM.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038EE2F21F0EC002CA5A0 /* AiVM.m */; }; 04E161832F10E6470022C23B /* normal_hei_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 04E161812F10E6470022C23B /* normal_hei_them.zip */; }; 04E161842F10E6470022C23B /* normal_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 04E161822F10E6470022C23B /* normal_them.zip */; }; 04FC95672EB0546C007BD342 /* KBKey.m in Sources */ = {isa = PBXBuildFile; fileRef = 04FC95652EB0546C007BD342 /* KBKey.m */; }; @@ -616,6 +620,13 @@ 04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = VoiceChatStreamingManager.m; sourceTree = "<group>"; }; 04E038DB2F20C420002CA5A0 /* VoiceChatWebSocketClient.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = VoiceChatWebSocketClient.h; sourceTree = "<group>"; }; 04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = VoiceChatWebSocketClient.m; sourceTree = "<group>"; }; + 04E038E22F20E500002CA5A0 /* deepgramAPI.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = deepgramAPI.md; sourceTree = "<group>"; }; + 04E038E42F20E877002CA5A0 /* DeepgramStreamingManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DeepgramStreamingManager.h; sourceTree = "<group>"; }; + 04E038E52F20E877002CA5A0 /* DeepgramStreamingManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = DeepgramStreamingManager.m; sourceTree = "<group>"; }; + 04E038E62F20E877002CA5A0 /* DeepgramWebSocketClient.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DeepgramWebSocketClient.h; sourceTree = "<group>"; }; + 04E038E72F20E877002CA5A0 /* DeepgramWebSocketClient.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = DeepgramWebSocketClient.m; sourceTree = "<group>"; }; + 04E038ED2F21F0EC002CA5A0 /* AiVM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AiVM.h; sourceTree = "<group>"; }; + 04E038EE2F21F0EC002CA5A0 /* AiVM.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AiVM.m; sourceTree = "<group>"; }; 04E161812F10E6470022C23B /* normal_hei_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path =
normal_hei_them.zip; sourceTree = "<group>"; }; 04E161822F10E6470022C23B /* normal_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = normal_them.zip; sourceTree = "<group>"; }; 04FC953A2EAFAE56007BD342 /* KeyBoardPrefixHeader.pch */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KeyBoardPrefixHeader.pch; sourceTree = "<group>"; }; @@ -998,6 +1009,12 @@ 04E038DA2F20C420002CA5A0 /* VoiceChatStreamingManager.m */, 04E038DB2F20C420002CA5A0 /* VoiceChatWebSocketClient.h */, 04E038DC2F20C420002CA5A0 /* VoiceChatWebSocketClient.m */, + 04E038E42F20E877002CA5A0 /* DeepgramStreamingManager.h */, + 04E038E52F20E877002CA5A0 /* DeepgramStreamingManager.m */, + 04E038E62F20E877002CA5A0 /* DeepgramWebSocketClient.h */, + 04E038E72F20E877002CA5A0 /* DeepgramWebSocketClient.m */, + 04E038ED2F21F0EC002CA5A0 /* AiVM.h */, + 04E038EE2F21F0EC002CA5A0 /* AiVM.m */, ); path = VM; sourceTree = "<group>"; @@ -1007,6 +1024,7 @@ children = ( 046086742F191CC700757C95 /* AI技术分析.txt */, 04E038D72F20BFFB002CA5A0 /* websocket-api.md */, + 04E038E22F20E500002CA5A0 /* deepgramAPI.md */, 0460866C2F191A5100757C95 /* M */, 0460866D2F191A5100757C95 /* V */, 0460866E2F191A5100757C95 /* VC */, @@ -2027,6 +2045,7 @@ 04286A132ECDEBF900CE730C /* KBSkinIconMap.strings in Resources */, 04C6EABD2EAF86530089C901 /* Main.storyboard in Resources */, 046086CB2F1A092500757C95 /* comments_mock.json in Resources */, + 04E038E32F20E500002CA5A0 /* deepgramAPI.md in Resources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -2233,6 +2252,8 @@ 0498BD712EE02A41006CC1D5 /* KBForgetPwdNewPwdVC.m in Sources */, 048908EF2EBF861800FABA60 /* KBSkinSectionTitleCell.m in Sources */, 0450AAE22EF03D5100B6AF06 /* KBPerson.swift in Sources */, + 04E038E82F20E877002CA5A0 /* DeepgramWebSocketClient.m in Sources */, + 04E038E92F20E877002CA5A0 /* DeepgramStreamingManager.m in Sources */, 048908E32EBF821700FABA60 /* KBSkinDetailVC.m in Sources */, 0477BDF32EBB7B850055D639 /* KBDirectionIndicatorView.m in Sources */, 049FB21A2EC20A9E00FAB05D /* KBMyKeyBoardVC.m in Sources */, @@ -2275,6 +2296,7 @@ 04FC97092EB31B14007BD342 /* KBHUD.m in Sources */, 04FC970E2EB334F8007BD342 /* UIImageView+KBWebImage.m in Sources */, 049FB2232EC311F900FAB05D /* KBPersonInfoVC.m in Sources */, + 04E038EF2F21F0EC002CA5A0 /* AiVM.m in Sources */, 0498BD6B2EE025FC006CC1D5 /* KBForgetPwdVC.m in Sources */, 046086B12F19239B00757C95 /* SubtitleSync.m in Sources */, 046086B22F19239B00757C95 /* TTSServiceClient.m in Sources */, diff --git a/keyBoard/Class/AiTalk/VC/KBAiMainVC.m b/keyBoard/Class/AiTalk/VC/KBAiMainVC.m index cc3b9df..481a632 100644 --- a/keyBoard/Class/AiTalk/VC/KBAiMainVC.m +++ b/keyBoard/Class/AiTalk/VC/KBAiMainVC.m @@ -7,6 +7,7 @@ #import "KBAiMainVC.h" #import "ConversationOrchestrator.h" +#import "DeepgramStreamingManager.h" #import "KBAICommentView.h" #import "KBAiChatView.h" #import "KBAiRecordButton.h" @@ -15,13 +16,15 @@ #import "KBUserSessionManager.h" @interface KBAiMainVC () < -    VoiceChatStreamingManagerDelegate> +    VoiceChatStreamingManagerDelegate, +    DeepgramStreamingManagerDelegate> @property(nonatomic, weak) LSTPopView *popView; // UI @property(nonatomic, strong) KBAiChatView *chatView; @property(nonatomic, strong) KBAiRecordButton *recordButton; @property(nonatomic, strong) UILabel *statusLabel; +@property(nonatomic, strong) UILabel *transcriptLabel; @property(nonatomic, strong) UIButton *commentButton; @property(nonatomic, strong) KBAICommentView *commentView; @property(nonatomic, strong) UIView *tabbarBackgroundView; @@ -32,9 +35,11 @@ // Core modules @property(nonatomic,
strong) ConversationOrchestrator *orchestrator; @property(nonatomic, strong) VoiceChatStreamingManager *streamingManager; +@property(nonatomic, strong) DeepgramStreamingManager *deepgramManager; // Text tracking @property(nonatomic, strong) NSMutableString *assistantVisibleText; +@property(nonatomic, strong) NSMutableString *deepgramFullText; // Log throttling @property(nonatomic, assign) NSTimeInterval lastRMSLogTime; @@ -55,6 +60,7 @@ [self setupUI]; [self setupOrchestrator]; [self setupStreamingManager]; + [self setupDeepgramManager]; } - (void)viewWillAppear:(BOOL)animated { @@ -68,6 +74,7 @@ // Stop the conversation when the page disappears [self.orchestrator stop]; [self.streamingManager disconnect]; + [self.deepgramManager disconnect]; } - (void)viewDidLayoutSubviews { @@ -132,6 +139,16 @@ self.statusLabel.translatesAutoresizingMaskIntoConstraints = NO; [self.view addSubview:self.statusLabel]; + // Transcript label + self.transcriptLabel = [[UILabel alloc] init]; + self.transcriptLabel.text = @""; + self.transcriptLabel.font = [UIFont systemFontOfSize:16]; + self.transcriptLabel.textColor = [UIColor labelColor]; + self.transcriptLabel.numberOfLines = 0; + self.transcriptLabel.textAlignment = NSTextAlignmentLeft; + self.transcriptLabel.translatesAutoresizingMaskIntoConstraints = NO; + [self.view addSubview:self.transcriptLabel]; + // Chat view // self.chatView = [[KBAiChatView alloc] init]; // self.chatView.backgroundColor = [UIColor systemBackgroundColor]; @@ -177,6 +194,13 @@ make.right.equalTo(self.view).offset(-16); }]; + [self.transcriptLabel mas_makeConstraints:^(MASConstraintMaker *make) { + make.top.equalTo(self.statusLabel.mas_bottom).offset(8); + make.left.equalTo(self.view).offset(16); + make.right.equalTo(self.view).offset(-16); + make.bottom.lessThanOrEqualTo(self.recordButton.mas_top).offset(-16); + }]; + [self.recordButton mas_makeConstraints:^(MASConstraintMaker *make) { make.left.equalTo(self.view.mas_safeAreaLayoutGuideLeft).offset(20); make.right.equalTo(self.view.mas_safeAreaLayoutGuideRight).offset(-20); @@ -304,6 +328,26 @@ self.lastRMSLogTime = 0; } +#pragma mark - Deepgram Manager + +- (void)setupDeepgramManager { + self.deepgramManager = [[DeepgramStreamingManager alloc] init]; + self.deepgramManager.delegate = self; + self.deepgramManager.serverURL = @"wss://api.deepgram.com/v1/listen"; + self.deepgramManager.apiKey = @"9c792eb63a65d644cbc95785155754cd1e84f8cf"; + self.deepgramManager.language = @"en"; + self.deepgramManager.model = @"nova-3"; + self.deepgramManager.punctuate = YES; + self.deepgramManager.smartFormat = YES; + self.deepgramManager.interimResults = YES; + self.deepgramManager.encoding = @"linear16"; + self.deepgramManager.sampleRate = 16000.0; + self.deepgramManager.channels = 1; + [self.deepgramManager prepareConnection]; + + self.deepgramFullText = [[NSMutableString alloc] init]; +} + #pragma mark - Events - (void)showComment { CGFloat customViewHeight = KB_SCREEN_HEIGHT * (0.8); @@ -402,17 +446,19 @@ self.statusLabel.text = @"正在连接..."; self.recordButton.state = KBAiRecordButtonStateRecording; - [self.streamingManager startWithToken:token language:@"en-US" voiceId:nil]; + [self.deepgramFullText setString:@""]; + self.transcriptLabel.text = @""; + [self.deepgramManager start]; } - (void)recordButtonDidEndPress:(KBAiRecordButton *)button { NSLog(@"[KBAiMainVC] Record button end press"); - [self.streamingManager stopAndFinalize]; + [self.deepgramManager stopAndFinalize]; } - (void)recordButtonDidCancelPress:(KBAiRecordButton *)button { NSLog(@"[KBAiMainVC] Record button cancel press"); - [self.streamingManager
cancel]; + [self.deepgramManager cancel]; } #pragma mark - VoiceChatStreamingManagerDelegate @@ -501,4 +547,55 @@ [self showError:error]; } +#pragma mark - DeepgramStreamingManagerDelegate + +- (void)deepgramStreamingManagerDidConnect { + self.statusLabel.text = @"已连接,准备中..."; +} + +- (void)deepgramStreamingManagerDidDisconnect:(NSError *_Nullable)error { + self.recordButton.state = KBAiRecordButtonStateNormal; + if (error) { + [self showError:error]; + } +} + +- (void)deepgramStreamingManagerDidUpdateRMS:(float)rms { + [self.recordButton updateVolumeRMS:rms]; + NSTimeInterval now = [[NSDate date] timeIntervalSince1970]; + if (now - self.lastRMSLogTime >= 1.0) { + self.lastRMSLogTime = now; + NSLog(@"[KBAiMainVC] RMS: %.3f", rms); + } +} + +- (void)deepgramStreamingManagerDidReceiveInterimTranscript:(NSString *)text { + self.statusLabel.text = @"正在识别..."; + NSString *displayText = text ?: @""; + if (self.deepgramFullText.length > 0 && displayText.length > 0) { + displayText = + [NSString stringWithFormat:@"%@ %@", self.deepgramFullText, displayText]; + } else if (self.deepgramFullText.length > 0) { + displayText = [self.deepgramFullText copy]; + } + self.transcriptLabel.text = displayText; +} + +- (void)deepgramStreamingManagerDidReceiveFinalTranscript:(NSString *)text { + if (text.length > 0) { + if (self.deepgramFullText.length > 0) { + [self.deepgramFullText appendString:@" "]; + } + [self.deepgramFullText appendString:text]; + } + self.transcriptLabel.text = self.deepgramFullText; + self.statusLabel.text = @"识别完成"; + self.recordButton.state = KBAiRecordButtonStateNormal; +} + +- (void)deepgramStreamingManagerDidFail:(NSError *)error { + self.recordButton.state = KBAiRecordButtonStateNormal; + [self showError:error]; +} + @end diff --git a/keyBoard/Class/AiTalk/VM/AiVM.h b/keyBoard/Class/AiTalk/VM/AiVM.h new file mode 100644 index 0000000..2bd6fe9 --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/AiVM.h @@ -0,0 +1,16 @@ +// +// AiVM.h +// keyBoard +// +// Created by Mac on 2026/1/22. +// + +#import + +NS_ASSUME_NONNULL_BEGIN + +@interface AiVM : NSObject + +@end + +NS_ASSUME_NONNULL_END diff --git a/keyBoard/Class/AiTalk/VM/AiVM.m b/keyBoard/Class/AiTalk/VM/AiVM.m new file mode 100644 index 0000000..322cd55 --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/AiVM.m @@ -0,0 +1,12 @@ +// +// AiVM.m +// keyBoard +// +// Created by Mac on 2026/1/22. 
+// + +#import "AiVM.h" + +@implementation AiVM + +@end diff --git a/keyBoard/Class/AiTalk/VM/AudioCaptureManager.m b/keyBoard/Class/AiTalk/VM/AudioCaptureManager.m index 64d8b5e..4caeab5 100644 --- a/keyBoard/Class/AiTalk/VM/AudioCaptureManager.m +++ b/keyBoard/Class/AiTalk/VM/AudioCaptureManager.m @@ -179,6 +179,11 @@ static const float kAudioSoftwareGain = 2.5f; } // Get the Int16 data + if (!outputBuffer.int16ChannelData) { + NSLog(@"[AudioCaptureManager] Int16 channel data is null"); + return; + } + int16_t *samples = (int16_t *)outputBuffer.int16ChannelData[0]; NSUInteger sampleCount = outputBuffer.frameLength; NSUInteger byteCount = sampleCount * sizeof(int16_t); @@ -189,13 +194,20 @@ static const float kAudioSoftwareGain = 2.5f; [self calculateAndReportRMS:samples sampleCount:sampleCount]; [self logAudioStatsIfNeeded:samples sampleCount:sampleCount]; + if (byteCount == 0) { + return; + } + + NSData *pcmData = [NSData dataWithBytes:samples length:byteCount]; + // Append the data to the ring buffer and emit complete frames dispatch_async(self.audioQueue, ^{ - [self appendToRingBuffer:samples byteCount:byteCount]; + [self appendToRingBuffer:(const uint8_t *)pcmData.bytes + byteCount:pcmData.length]; }); } -- (void)appendToRingBuffer:(int16_t *)samples byteCount:(NSUInteger)byteCount { +- (void)appendToRingBuffer:(const uint8_t *)bytes byteCount:(NSUInteger)byteCount { // Append new data to the ring buffer uint8_t *ringBufferBytes = (uint8_t *)self.ringBuffer.mutableBytes; NSUInteger ringBufferLength = self.ringBuffer.length; @@ -208,7 +220,7 @@ static const float kAudioSoftwareGain = 2.5f; NSUInteger copySize = MIN(bytesToCopy, spaceAvailable); memcpy(ringBufferBytes + self.ringBufferWriteIndex, - (uint8_t *)samples + sourceOffset, copySize); + bytes + sourceOffset, copySize); self.ringBufferWriteIndex += copySize; sourceOffset += copySize; bytesToCopy -= copySize; diff --git a/keyBoard/Class/AiTalk/VM/DeepgramStreamingManager.h b/keyBoard/Class/AiTalk/VM/DeepgramStreamingManager.h new file mode 100644 index 0000000..9d1f391 --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/DeepgramStreamingManager.h @@ -0,0 +1,50 @@ +// +// DeepgramStreamingManager.h +// keyBoard +// +// Created by Mac on 2026/1/21. +// + +#import <Foundation/Foundation.h> + +NS_ASSUME_NONNULL_BEGIN + +@protocol DeepgramStreamingManagerDelegate <NSObject> +@optional +- (void)deepgramStreamingManagerDidConnect; +- (void)deepgramStreamingManagerDidDisconnect:(NSError *_Nullable)error; +- (void)deepgramStreamingManagerDidUpdateRMS:(float)rms; +- (void)deepgramStreamingManagerDidReceiveInterimTranscript:(NSString *)text; +- (void)deepgramStreamingManagerDidReceiveFinalTranscript:(NSString *)text; +- (void)deepgramStreamingManagerDidFail:(NSError *)error; +@end + +/// Manager for Deepgram live transcription.
+@interface DeepgramStreamingManager : NSObject + +@property(nonatomic, weak) id<DeepgramStreamingManagerDelegate> delegate; + +@property(nonatomic, copy) NSString *serverURL; // wss://api.deepgram.com/v1/listen +@property(nonatomic, copy) NSString *apiKey; + +@property(nonatomic, copy, nullable) NSString *language; +@property(nonatomic, copy, nullable) NSString *model; +@property(nonatomic, assign) BOOL punctuate; +@property(nonatomic, assign) BOOL smartFormat; +@property(nonatomic, assign) BOOL interimResults; + +@property(nonatomic, copy) NSString *encoding; // linear16 +@property(nonatomic, assign) double sampleRate; +@property(nonatomic, assign) int channels; + +@property(nonatomic, assign, readonly, getter=isStreaming) BOOL streaming; + +- (void)start; +- (void)prepareConnection; +- (void)stopAndFinalize; +- (void)cancel; +- (void)disconnect; + +@end + +NS_ASSUME_NONNULL_END diff --git a/keyBoard/Class/AiTalk/VM/DeepgramStreamingManager.m b/keyBoard/Class/AiTalk/VM/DeepgramStreamingManager.m new file mode 100644 index 0000000..a961f2b --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/DeepgramStreamingManager.m @@ -0,0 +1,508 @@ +// +// DeepgramStreamingManager.m +// keyBoard +// +// Created by Mac on 2026/1/21. +// + +#import "DeepgramStreamingManager.h" +#import "AudioCaptureManager.h" +#import "AudioSessionManager.h" +#import "DeepgramWebSocketClient.h" +#import <UIKit/UIKit.h> + +static NSString *const kDeepgramStreamingManagerErrorDomain = + @"DeepgramStreamingManager"; + +@interface DeepgramStreamingManager () <AudioCaptureManagerDelegate, AudioSessionManagerDelegate, DeepgramWebSocketClientDelegate> + +@property(nonatomic, strong) AudioSessionManager *audioSession; +@property(nonatomic, strong) AudioCaptureManager *audioCapture; +@property(nonatomic, strong) DeepgramWebSocketClient *client; +@property(nonatomic, strong) dispatch_queue_t stateQueue; + +@property(nonatomic, assign) BOOL streaming; +@property(nonatomic, strong) NSMutableArray *pendingFrames; +@property(nonatomic, assign) NSUInteger pendingFrameLimit; +@property(nonatomic, assign) BOOL connecting; +@property(nonatomic, assign) BOOL pendingStart; +@property(nonatomic, assign) BOOL keepConnection; +@property(nonatomic, strong) dispatch_source_t keepAliveTimer; +@property(nonatomic, assign) NSInteger reconnectAttempts; +@property(nonatomic, assign) NSInteger maxReconnectAttempts; +@property(nonatomic, assign) BOOL reconnectScheduled; +@property(nonatomic, assign) BOOL appInBackground; +@property(nonatomic, assign) BOOL shouldReconnectOnForeground; + +@end + +@implementation DeepgramStreamingManager + +- (instancetype)init { + self = [super init]; + if (self) { + _stateQueue = dispatch_queue_create("com.keyboard.aitalk.deepgram.manager", + DISPATCH_QUEUE_SERIAL); + + _audioSession = [AudioSessionManager sharedManager]; + _audioSession.delegate = self; + + _audioCapture = [[AudioCaptureManager alloc] init]; + _audioCapture.delegate = self; + + _client = [[DeepgramWebSocketClient alloc] init]; + _client.delegate = self; + + _serverURL = @"wss://api.deepgram.com/v1/listen"; + _encoding = @"linear16"; + _sampleRate = 16000.0; + _channels = 1; + _punctuate = YES; + _smartFormat = YES; + _interimResults = YES; + + _pendingFrames = [[NSMutableArray alloc] init]; + _pendingFrameLimit = 25; + _connecting = NO; + _pendingStart = NO; + _keepConnection = NO; + _reconnectAttempts = 0; + _maxReconnectAttempts = 5; + _reconnectScheduled = NO; + _appInBackground = NO; + _shouldReconnectOnForeground = NO; + + [self setupNotifications]; + } + return self; +} + +- (void)dealloc { + [self removeNotifications]; + [self disconnect]; +} + +- (void)start {
dispatch_async(self.stateQueue, ^{ + if (self.appInBackground) { + self.shouldReconnectOnForeground = YES; + return; + } + self.keepConnection = YES; + self.pendingStart = YES; + self.reconnectAttempts = 0; + if (self.apiKey.length == 0) { + [self reportErrorWithMessage:@"Deepgram API key is required"]; + return; + } + + if (![self.audioSession hasMicrophonePermission]) { + __weak typeof(self) weakSelf = self; + [self.audioSession requestMicrophonePermission:^(BOOL granted) { + __strong typeof(weakSelf) strongSelf = weakSelf; + if (!strongSelf) { + return; + } + if (!granted) { + [strongSelf reportErrorWithMessage:@"Microphone permission denied"]; + return; + } + dispatch_async(strongSelf.stateQueue, ^{ + [strongSelf start]; + }); + }]; + return; + } + + NSError *error = nil; + if (![self.audioSession configureForConversation:&error]) { + [self reportError:error]; + return; + } + + if (![self.audioSession activateSession:&error]) { + [self reportError:error]; + return; + } + + if (![self.audioCapture isCapturing]) { + NSError *captureError = nil; + if (![self.audioCapture startCapture:&captureError]) { + [self reportError:captureError]; + return; + } + } + + NSLog(@"[DeepgramStreamingManager] Start streaming, server: %@", + self.serverURL); + + if (self.client.isConnected) { + [self beginStreamingIfReady]; + return; + } + + [self connectIfNeeded]; + }); +} + +- (void)prepareConnection { + dispatch_async(self.stateQueue, ^{ + if (self.appInBackground) { + self.shouldReconnectOnForeground = YES; + return; + } + self.keepConnection = YES; + self.pendingStart = NO; + self.reconnectAttempts = 0; + + if (self.apiKey.length == 0) { + NSLog(@"[DeepgramStreamingManager] Prepare skipped: API key missing"); + return; + } + + if (self.client.isConnected) { + return; + } + + [self connectIfNeeded]; + }); +} + +- (void)stopAndFinalize { + dispatch_async(self.stateQueue, ^{ + if (self.streaming) { + [self.audioCapture stopCapture]; + self.streaming = NO; + } + [self.pendingFrames removeAllObjects]; + self.pendingStart = NO; + [self.client disableAudioSending]; + [self startKeepAliveIfNeeded]; + }); +} + +- (void)cancel { + dispatch_async(self.stateQueue, ^{ + if (self.streaming) { + [self.audioCapture stopCapture]; + self.streaming = NO; + } + [self.pendingFrames removeAllObjects]; + self.pendingStart = NO; + self.keepConnection = NO; + [self.client disableAudioSending]; + [self stopKeepAlive]; + [self.client disconnect]; + }); +} + +- (void)disconnect { + dispatch_async(self.stateQueue, ^{ + if (self.streaming) { + [self.audioCapture stopCapture]; + self.streaming = NO; + } + [self.pendingFrames removeAllObjects]; + self.pendingStart = NO; + self.keepConnection = NO; + self.shouldReconnectOnForeground = NO; + [self.client disableAudioSending]; + [self stopKeepAlive]; + [self.client disconnect]; + [self.audioSession deactivateSession]; + }); +} + +#pragma mark - AudioCaptureManagerDelegate + +- (void)audioCaptureManagerDidOutputPCMFrame:(NSData *)pcmFrame { + if (pcmFrame.length == 0) { + return; + } + + dispatch_async(self.stateQueue, ^{ + if (!self.streaming || !self.client.isConnected) { + [self.pendingFrames addObject:pcmFrame]; + if (self.pendingFrames.count > self.pendingFrameLimit) { + [self.pendingFrames removeObjectAtIndex:0]; + } + return; + } + + [self.client sendAudioPCMFrame:pcmFrame]; + }); +} + +- (void)audioCaptureManagerDidUpdateRMS:(float)rms { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (deepgramStreamingManagerDidUpdateRMS:)]) { + 
[self.delegate deepgramStreamingManagerDidUpdateRMS:rms]; + } + }); +} + +#pragma mark - AudioSessionManagerDelegate + +- (void)audioSessionManagerDidInterrupt:(KBAudioSessionInterruptionType)type { + if (type == KBAudioSessionInterruptionTypeBegan) { + [self cancel]; + } +} + +- (void)audioSessionManagerMicrophonePermissionDenied { + [self reportErrorWithMessage:@"Microphone permission denied"]; +} + +#pragma mark - DeepgramWebSocketClientDelegate + +- (void)deepgramClientDidConnect { + dispatch_async(self.stateQueue, ^{ + self.connecting = NO; + self.reconnectAttempts = 0; + self.reconnectScheduled = NO; + [self beginStreamingIfReady]; + [self startKeepAliveIfNeeded]; + + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (deepgramStreamingManagerDidConnect)]) { + [self.delegate deepgramStreamingManagerDidConnect]; + } + }); + }); +} + +- (void)deepgramClientDidDisconnect:(NSError *_Nullable)error { + dispatch_async(self.stateQueue, ^{ + if (self.streaming) { + [self.audioCapture stopCapture]; + self.streaming = NO; + } + self.connecting = NO; + [self.audioSession deactivateSession]; + [self stopKeepAlive]; + + if (self.pendingStart || self.keepConnection) { + [self scheduleReconnectWithError:error]; + } + }); + + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (deepgramStreamingManagerDidDisconnect:)]) { + [self.delegate deepgramStreamingManagerDidDisconnect:error]; + } + }); +} + +- (void)deepgramClientDidReceiveInterimTranscript:(NSString *)text { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (deepgramStreamingManagerDidReceiveInterimTranscript:)]) { + [self.delegate deepgramStreamingManagerDidReceiveInterimTranscript:text]; + } + }); +} + +- (void)deepgramClientDidReceiveFinalTranscript:(NSString *)text { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (deepgramStreamingManagerDidReceiveFinalTranscript:)]) { + [self.delegate deepgramStreamingManagerDidReceiveFinalTranscript:text]; + } + }); +} + +- (void)deepgramClientDidFail:(NSError *)error { + [self reportError:error]; +} + +#pragma mark - Error Reporting + +- (void)reportError:(NSError *)error { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (deepgramStreamingManagerDidFail:)]) { + [self.delegate deepgramStreamingManagerDidFail:error]; + } + }); +} + +- (void)reportErrorWithMessage:(NSString *)message { + NSError *error = [NSError errorWithDomain:kDeepgramStreamingManagerErrorDomain + code:-1 + userInfo:@{ + NSLocalizedDescriptionKey : message ?: @"" + }]; + [self reportError:error]; +} + +- (void)connectIfNeeded { + if (self.connecting || self.client.isConnected) { + return; + } + + if (self.serverURL.length == 0) { + [self reportErrorWithMessage:@"Deepgram server URL is required"]; + return; + } + + self.client.serverURL = self.serverURL; + self.client.apiKey = self.apiKey; + self.client.language = self.language; + self.client.model = self.model; + self.client.punctuate = self.punctuate; + self.client.smartFormat = self.smartFormat; + self.client.interimResults = self.interimResults; + self.client.encoding = self.encoding; + self.client.sampleRate = self.sampleRate; + self.client.channels = self.channels; + [self.client disableAudioSending]; + self.connecting = YES; + [self.client connect]; +} + +- (void)beginStreamingIfReady { + if (!self.pendingStart) { + return; + } + + 
self.streaming = YES; + [self.client enableAudioSending]; + [self stopKeepAlive]; + + if (self.pendingFrames.count > 0) { + NSArray *frames = [self.pendingFrames copy]; + [self.pendingFrames removeAllObjects]; + for (NSData *frame in frames) { + [self.client sendAudioPCMFrame:frame]; + } + NSLog(@"[DeepgramStreamingManager] Flushed %lu pending frames", + (unsigned long)frames.count); + } +} + +- (void)scheduleReconnectWithError:(NSError *_Nullable)error { + if (self.reconnectScheduled || self.connecting || self.client.isConnected) { + return; + } + + if (self.appInBackground) { + self.shouldReconnectOnForeground = YES; + return; + } + + if (self.reconnectAttempts >= self.maxReconnectAttempts) { + NSLog(@"[DeepgramStreamingManager] Reconnect failed %ld times, stop retry. %@", + (long)self.maxReconnectAttempts, + error.localizedDescription ?: @""); + self.pendingStart = NO; + self.keepConnection = NO; + return; + } + + self.reconnectAttempts += 1; + self.reconnectScheduled = YES; + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, (int64_t)(1 * NSEC_PER_SEC)), + self.stateQueue, ^{ + self.reconnectScheduled = NO; + if (self.appInBackground) { + self.shouldReconnectOnForeground = YES; + return; + } + if (!self.pendingStart && !self.keepConnection) { + return; + } + [self connectIfNeeded]; + }); +} + +- (void)setupNotifications { + NSNotificationCenter *center = [NSNotificationCenter defaultCenter]; + [center addObserver:self + selector:@selector(handleAppDidEnterBackground) + name:UIApplicationDidEnterBackgroundNotification + object:nil]; + [center addObserver:self + selector:@selector(handleAppWillEnterForeground) + name:UIApplicationWillEnterForegroundNotification + object:nil]; +} + +- (void)removeNotifications { + [[NSNotificationCenter defaultCenter] removeObserver:self]; +} + +- (void)handleAppDidEnterBackground { + dispatch_async(self.stateQueue, ^{ + self.appInBackground = YES; + self.shouldReconnectOnForeground = + self.keepConnection || self.pendingStart; + self.pendingStart = NO; + self.keepConnection = NO; + + if (self.streaming) { + [self.audioCapture stopCapture]; + self.streaming = NO; + } + + [self.pendingFrames removeAllObjects]; + [self.client disableAudioSending]; + [self stopKeepAlive]; + [self.client disconnect]; + [self.audioSession deactivateSession]; + + NSLog(@"[DeepgramStreamingManager] App entered background, socket closed"); + }); +} + +- (void)handleAppWillEnterForeground { + dispatch_async(self.stateQueue, ^{ + self.appInBackground = NO; + if (self.shouldReconnectOnForeground) { + self.keepConnection = YES; + self.reconnectAttempts = 0; + [self connectIfNeeded]; + } + self.shouldReconnectOnForeground = NO; + }); +} + +- (void)startKeepAliveIfNeeded { + if (!self.keepConnection || !self.client.isConnected || self.streaming) { + return; + } + + if (self.keepAliveTimer) { + return; + } + + self.keepAliveTimer = + dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, + self.stateQueue); + dispatch_source_set_timer(self.keepAliveTimer, + dispatch_time(DISPATCH_TIME_NOW, 15 * NSEC_PER_SEC), + 15 * NSEC_PER_SEC, 1 * NSEC_PER_SEC); + __weak typeof(self) weakSelf = self; + dispatch_source_set_event_handler(self.keepAliveTimer, ^{ + __strong typeof(weakSelf) strongSelf = weakSelf; + if (!strongSelf) { + return; + } + [strongSelf.client sendKeepAlive]; + }); + dispatch_resume(self.keepAliveTimer); +} + +- (void)stopKeepAlive { + if (self.keepAliveTimer) { + dispatch_source_cancel(self.keepAliveTimer); + self.keepAliveTimer = nil; + } +} + +@end diff --git 
a/keyBoard/Class/AiTalk/VM/DeepgramWebSocketClient.h b/keyBoard/Class/AiTalk/VM/DeepgramWebSocketClient.h new file mode 100644 index 0000000..730ebca --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/DeepgramWebSocketClient.h @@ -0,0 +1,52 @@ +// +// DeepgramWebSocketClient.h +// keyBoard +// +// Created by Mac on 2026/1/21. +// + +#import <Foundation/Foundation.h> + +NS_ASSUME_NONNULL_BEGIN + +@protocol DeepgramWebSocketClientDelegate <NSObject> +@optional +- (void)deepgramClientDidConnect; +- (void)deepgramClientDidDisconnect:(NSError *_Nullable)error; +- (void)deepgramClientDidReceiveInterimTranscript:(NSString *)text; +- (void)deepgramClientDidReceiveFinalTranscript:(NSString *)text; +- (void)deepgramClientDidFail:(NSError *)error; +@end + +/// WebSocket client for Deepgram live transcription. +@interface DeepgramWebSocketClient : NSObject + +@property(nonatomic, weak) id<DeepgramWebSocketClientDelegate> delegate; + +@property(nonatomic, copy) NSString *serverURL; // wss://api.deepgram.com/v1/listen +@property(nonatomic, copy) NSString *apiKey; + +@property(nonatomic, copy, nullable) NSString *language; +@property(nonatomic, copy, nullable) NSString *model; +@property(nonatomic, assign) BOOL punctuate; +@property(nonatomic, assign) BOOL smartFormat; +@property(nonatomic, assign) BOOL interimResults; + +@property(nonatomic, copy) NSString *encoding; // linear16 +@property(nonatomic, assign) double sampleRate; +@property(nonatomic, assign) int channels; + +@property(nonatomic, assign, readonly, getter=isConnected) BOOL connected; + +- (void)connect; +- (void)disconnect; +- (void)sendAudioPCMFrame:(NSData *)pcmFrame; +- (void)finish; +- (void)sendKeepAlive; + +- (void)enableAudioSending; +- (void)disableAudioSending; + +@end + +NS_ASSUME_NONNULL_END diff --git a/keyBoard/Class/AiTalk/VM/DeepgramWebSocketClient.m b/keyBoard/Class/AiTalk/VM/DeepgramWebSocketClient.m new file mode 100644 index 0000000..a9ec157 --- /dev/null +++ b/keyBoard/Class/AiTalk/VM/DeepgramWebSocketClient.m @@ -0,0 +1,413 @@ +// +// DeepgramWebSocketClient.m +// keyBoard +// +// Created by Mac on 2026/1/21.
+// + +#import "DeepgramWebSocketClient.h" + +static NSString *const kDeepgramWebSocketClientErrorDomain = + @"DeepgramWebSocketClient"; + +@interface DeepgramWebSocketClient () <NSURLSessionWebSocketDelegate> + +@property(nonatomic, strong) NSURLSession *urlSession; +@property(nonatomic, strong) NSURLSessionWebSocketTask *webSocketTask; +@property(nonatomic, strong) dispatch_queue_t networkQueue; +@property(nonatomic, assign) BOOL connected; +@property(nonatomic, assign) BOOL audioSendingEnabled; + +@end + +@implementation DeepgramWebSocketClient + +- (instancetype)init { + self = [super init]; + if (self) { + _networkQueue = dispatch_queue_create("com.keyboard.aitalk.deepgram.ws", + DISPATCH_QUEUE_SERIAL); + _serverURL = @"wss://api.deepgram.com/v1/listen"; + _encoding = @"linear16"; + _sampleRate = 16000.0; + _channels = 1; + _punctuate = YES; + _smartFormat = YES; + _interimResults = YES; + _audioSendingEnabled = NO; + } + return self; +} + +- (void)dealloc { + [self disconnect]; +} + +#pragma mark - Public Methods + +- (void)connect { + dispatch_async(self.networkQueue, ^{ + [self disconnectInternal]; + + if (self.apiKey.length == 0) { + [self reportErrorWithMessage:@"Deepgram API key is required"]; + return; + } + + NSURL *url = [self buildURL]; + if (!url) { + [self reportErrorWithMessage:@"Invalid Deepgram URL"]; + return; + } + + NSLog(@"[DeepgramWebSocketClient] Connecting: %@", url.absoluteString); + + NSURLSessionConfiguration *config = + [NSURLSessionConfiguration defaultSessionConfiguration]; + config.timeoutIntervalForRequest = 30; + config.timeoutIntervalForResource = 300; + + self.urlSession = [NSURLSession sessionWithConfiguration:config + delegate:self + delegateQueue:nil]; + + NSMutableURLRequest *request = [NSMutableURLRequest requestWithURL:url]; + [request setValue:[NSString stringWithFormat:@"Token %@", self.apiKey] + forHTTPHeaderField:@"Authorization"]; + + self.webSocketTask = [self.urlSession webSocketTaskWithRequest:request]; + [self.webSocketTask resume]; + [self receiveMessage]; + }); +} + +- (void)disconnect { + dispatch_async(self.networkQueue, ^{ + BOOL shouldNotify = self.webSocketTask != nil; + if (shouldNotify) { + NSLog(@"[DeepgramWebSocketClient] Disconnect requested"); + } + [self disconnectInternal]; + if (shouldNotify) { + [self notifyDisconnect:nil]; + } + }); +} + +- (void)sendAudioPCMFrame:(NSData *)pcmFrame { + if (!self.connected || !self.webSocketTask || pcmFrame.length == 0) { + return; + } + + dispatch_async(self.networkQueue, ^{ + if (!self.audioSendingEnabled) { + return; + } + if (!self.connected || !self.webSocketTask) { + return; + } + + NSURLSessionWebSocketMessage *message = + [[NSURLSessionWebSocketMessage alloc] initWithData:pcmFrame]; + [self.webSocketTask + sendMessage:message + completionHandler:^(NSError *_Nullable error) { + if (error) { + [self reportError:error]; + } else { + NSLog(@"[DeepgramWebSocketClient] Sent audio frame: %lu bytes", + (unsigned long)pcmFrame.length); + } + }]; + }); +} + +- (void)finish { + NSLog(@"[DeepgramWebSocketClient] Sending CloseStream"); + [self sendJSON:@{ @"type" : @"CloseStream" }]; +} + +- (void)sendKeepAlive { + if (!self.connected || !self.webSocketTask) { + return; + } + [self sendJSON:@{ @"type" : @"KeepAlive" }]; +} + +- (void)enableAudioSending { + dispatch_async(self.networkQueue, ^{ + self.audioSendingEnabled = YES; + }); +} + +- (void)disableAudioSending { + dispatch_async(self.networkQueue, ^{ + self.audioSendingEnabled = NO; + }); +} + +#pragma mark - Private Methods + +- (NSURL *)buildURL { + if
(self.serverURL.length == 0) { + return nil; + } + + NSURLComponents *components = + [NSURLComponents componentsWithString:self.serverURL]; + if (!components) { + return nil; + } + + NSMutableArray *items = + components.queryItems.mutableCopy ?: [NSMutableArray array]; + + [self upsertQueryItemWithName:@"model" value:self.model items:items]; + [self upsertQueryItemWithName:@"language" value:self.language items:items]; + + [self upsertQueryItemWithName:@"punctuate" + value:(self.punctuate ? @"true" : @"false") + items:items]; + [self upsertQueryItemWithName:@"smart_format" + value:(self.smartFormat ? @"true" : @"false") + items:items]; + [self upsertQueryItemWithName:@"interim_results" + value:(self.interimResults ? @"true" : @"false") + items:items]; + + [self upsertQueryItemWithName:@"encoding" value:self.encoding items:items]; + [self upsertQueryItemWithName:@"sample_rate" + value:[NSString stringWithFormat:@"%.0f", + self.sampleRate] + items:items]; + [self upsertQueryItemWithName:@"channels" + value:[NSString stringWithFormat:@"%d", self.channels] + items:items]; + + components.queryItems = items; + return components.URL; +} + +- (void)upsertQueryItemWithName:(NSString *)name + value:(NSString *)value + items:(NSMutableArray *)items { + if (name.length == 0 || value.length == 0) { + return; + } + + for (NSUInteger i = 0; i < items.count; i++) { + NSURLQueryItem *item = items[i]; + if ([item.name isEqualToString:name]) { + items[i] = [NSURLQueryItem queryItemWithName:name value:value]; + return; + } + } + + [items addObject:[NSURLQueryItem queryItemWithName:name value:value]]; +} + +- (void)sendJSON:(NSDictionary *)dict { + if (!self.webSocketTask) { + return; + } + + NSError *jsonError = nil; + NSData *jsonData = [NSJSONSerialization dataWithJSONObject:dict + options:0 + error:&jsonError]; + if (jsonError) { + [self reportError:jsonError]; + return; + } + + NSString *jsonString = + [[NSString alloc] initWithData:jsonData + encoding:NSUTF8StringEncoding]; + if (!jsonString) { + [self reportErrorWithMessage:@"Failed to encode JSON message"]; + return; + } + + dispatch_async(self.networkQueue, ^{ + NSURLSessionWebSocketMessage *message = + [[NSURLSessionWebSocketMessage alloc] initWithString:jsonString]; + [self.webSocketTask + sendMessage:message + completionHandler:^(NSError *_Nullable error) { + if (error) { + [self reportError:error]; + } + }]; + }); +} + +- (void)receiveMessage { + if (!self.webSocketTask) { + return; + } + + __weak typeof(self) weakSelf = self; + [self.webSocketTask receiveMessageWithCompletionHandler:^( + NSURLSessionWebSocketMessage *_Nullable message, + NSError *_Nullable error) { + __strong typeof(weakSelf) strongSelf = weakSelf; + if (!strongSelf) { + return; + } + + if (error) { + if (error.code != NSURLErrorCancelled && error.code != 57) { + [strongSelf notifyDisconnect:error]; + [strongSelf disconnectInternal]; + } + return; + } + + if (message.type == NSURLSessionWebSocketMessageTypeString) { + NSLog(@"[DeepgramWebSocketClient] Received text: %@", message.string); + [strongSelf handleTextMessage:message.string]; + } else if (message.type == NSURLSessionWebSocketMessageTypeData) { + NSLog(@"[DeepgramWebSocketClient] Received binary: %lu bytes", + (unsigned long)message.data.length); + [strongSelf handleBinaryMessage:message.data]; + } + + [strongSelf receiveMessage]; + }]; +} + +- (void)handleTextMessage:(NSString *)text { + if (text.length == 0) { + return; + } + + NSData *data = [text dataUsingEncoding:NSUTF8StringEncoding]; + if (!data) { + return; + } + 
+ NSError *jsonError = nil; + NSDictionary *json = [NSJSONSerialization JSONObjectWithData:data + options:0 + error:&jsonError]; + if (jsonError) { + [self reportError:jsonError]; + return; + } + + NSString *errorMessage = json[@"error"]; + if (errorMessage.length > 0) { + [self reportErrorWithMessage:errorMessage]; + return; + } + + NSDictionary *channel = json[@"channel"]; + if (![channel isKindOfClass:[NSDictionary class]]) { + return; + } + + NSArray *alternatives = channel[@"alternatives"]; + if (![alternatives isKindOfClass:[NSArray class]] || alternatives.count == 0) { + return; + } + + NSDictionary *firstAlt = alternatives.firstObject; + NSString *transcript = firstAlt[@"transcript"] ?: @""; + BOOL isFinal = [json[@"is_final"] boolValue] || + [json[@"speech_final"] boolValue]; + + if (transcript.length == 0) { + return; + } + + dispatch_async(dispatch_get_main_queue(), ^{ + if (isFinal) { + if ([self.delegate respondsToSelector:@selector + (deepgramClientDidReceiveFinalTranscript:)]) { + [self.delegate deepgramClientDidReceiveFinalTranscript:transcript]; + } + } else { + if ([self.delegate respondsToSelector:@selector + (deepgramClientDidReceiveInterimTranscript:)]) { + [self.delegate deepgramClientDidReceiveInterimTranscript:transcript]; + } + } + }); +} + +- (void)handleBinaryMessage:(NSData *)data { +} + +- (void)disconnectInternal { + self.connected = NO; + self.audioSendingEnabled = NO; + + if (self.webSocketTask) { + [self.webSocketTask + cancelWithCloseCode:NSURLSessionWebSocketCloseCodeNormalClosure + reason:nil]; + self.webSocketTask = nil; + } + + if (self.urlSession) { + [self.urlSession invalidateAndCancel]; + self.urlSession = nil; + } +} + +- (void)reportError:(NSError *)error { + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector(deepgramClientDidFail:)]) { + [self.delegate deepgramClientDidFail:error]; + } + }); +} + +- (void)reportErrorWithMessage:(NSString *)message { + NSError *error = [NSError errorWithDomain:kDeepgramWebSocketClientErrorDomain + code:-1 + userInfo:@{ + NSLocalizedDescriptionKey : message ?: @"" + }]; + [self reportError:error]; +} + +- (void)notifyDisconnect:(NSError *_Nullable)error { + self.connected = NO; + + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector + (deepgramClientDidDisconnect:)]) { + [self.delegate deepgramClientDidDisconnect:error]; + } + }); +} + +#pragma mark - NSURLSessionWebSocketDelegate + +- (void)URLSession:(NSURLSession *)session + webSocketTask:(NSURLSessionWebSocketTask *)webSocketTask + didOpenWithProtocol:(NSString *)protocol { + self.connected = YES; + NSLog(@"[DeepgramWebSocketClient] Connected"); + dispatch_async(dispatch_get_main_queue(), ^{ + if ([self.delegate respondsToSelector:@selector(deepgramClientDidConnect)]) { + [self.delegate deepgramClientDidConnect]; + } + }); +} + +- (void)URLSession:(NSURLSession *)session + webSocketTask:(NSURLSessionWebSocketTask *)webSocketTask + didCloseWithCode:(NSURLSessionWebSocketCloseCode)closeCode + reason:(NSData *)reason { + if (!self.webSocketTask) { + return; + } + NSLog(@"[DeepgramWebSocketClient] Closed with code: %ld", + (long)closeCode); + [self notifyDisconnect:nil]; + [self disconnectInternal]; +} + +@end diff --git a/keyBoard/Class/AiTalk/deepgramAPI.md b/keyBoard/Class/AiTalk/deepgramAPI.md new file mode 100644 index 0000000..208be33 --- /dev/null +++ b/keyBoard/Class/AiTalk/deepgramAPI.md @@ -0,0 +1,1119 @@ +# Pre-Recorded Audio + +POST https://api.deepgram.com/v1/listen 
+Content-Type: application/json + +Transcribe audio and video using Deepgram's speech-to-text REST API + +Reference: https://developers.deepgram.com/reference/speech-to-text/listen-pre-recorded + +## OpenAPI Specification + +```yaml +openapi: 3.1.1 +info: + title: Transcribe and analyze pre-recorded audio and video + version: endpoint_listen/v1/media.transcribe +paths: + /v1/listen: + post: + operationId: transcribe + summary: Transcribe and analyze pre-recorded audio and video + description: Transcribe audio and video using Deepgram's speech-to-text REST API + tags: + - - subpackage_listen + - subpackage_listen/v1 + - subpackage_listen/v1/media + parameters: + - name: callback + in: query + description: URL to which we'll make the callback request + required: false + schema: + type: string + - name: callback_method + in: query + description: HTTP method by which the callback request will be made + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersCallbackMethod' + - name: extra + in: query + description: >- + Arbitrary key-value pairs that are attached to the API response for + usage in downstream processing + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersExtra' + - name: sentiment + in: query + description: Recognizes the sentiment throughout a transcript or text + required: false + schema: + type: boolean + default: false + - name: summarize + in: query + description: >- + Summarize content. For Listen API, supports string version option. + For Read API, accepts boolean only. + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersSummarize' + - name: tag + in: query + description: >- + Label your requests for the purpose of identification during usage + reporting + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersTag' + - name: topics + in: query + description: Detect topics throughout a transcript or text + required: false + schema: + type: boolean + default: false + - name: custom_topic + in: query + description: >- + Custom topics you want the model to detect within your input audio + or text, if present. Submit up to `100`. + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersCustomTopic' + - name: custom_topic_mode + in: query + description: >- + Sets how the model will interpret strings submitted to the + `custom_topic` param. When `strict`, the model will only return + topics submitted using the `custom_topic` param. When `extended`, + the model will return its own detected topics in addition to those + submitted using the `custom_topic` param + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersCustomTopicMode' + - name: intents + in: query + description: Recognizes speaker intent throughout a transcript or text + required: false + schema: + type: boolean + default: false + - name: custom_intent + in: query + description: >- + Custom intents you want the model to detect within your input audio + if present + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersCustomIntent' + - name: custom_intent_mode + in: query + description: >- + Sets how the model will interpret intents submitted to the + `custom_intent` param. When `strict`, the model will only return + intents submitted using the `custom_intent` param. When `extended`, + the model will return its own detected intents in addition to those + submitted using the `custom_intent` param.
+ required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersCustomIntentMode' + - name: detect_entities + in: query + description: Identifies and extracts key entities from content in submitted audio + required: false + schema: + type: boolean + default: false + - name: detect_language + in: query + description: Identifies the dominant language spoken in submitted audio + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersDetectLanguage' + - name: diarize + in: query + description: >- + Recognize speaker changes. Each word in the transcript will be + assigned a speaker number starting at 0 + required: false + schema: + type: boolean + default: false + - name: dictation + in: query + description: Dictation mode for controlling formatting with dictated speech + required: false + schema: + type: boolean + default: false + - name: encoding + in: query + description: Specify the expected encoding of your submitted audio + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersEncoding' + - name: filler_words + in: query + description: >- + Filler Words can help transcribe interruptions in your audio, like + "uh" and "um" + required: false + schema: + type: boolean + default: false + - name: keyterm + in: query + description: >- + Key term prompting can boost or suppress specialized terminology and + brands. Only compatible with Nova-3 + required: false + schema: + type: array + items: + type: string + - name: keywords + in: query + description: Keywords can boost or suppress specialized terminology and brands + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersKeywords' + - name: language + in: query + description: >- + The [BCP-47 language tag](https://tools.ietf.org/html/bcp47) that + hints at the primary spoken language. 
Depending on the Model and API + endpoint you choose, only certain languages are available + required: false + schema: + type: string + default: en + - name: measurements + in: query + description: >- + Spoken measurements will be converted to their corresponding + abbreviations + required: false + schema: + type: boolean + default: false + - name: model + in: query + description: AI model used to process submitted audio + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersModel' + - name: multichannel + in: query + description: Transcribe each audio channel independently + required: false + schema: + type: boolean + default: false + - name: numerals + in: query + description: Numerals converts numbers from written format to numerical format + required: false + schema: + type: boolean + default: false + - name: paragraphs + in: query + description: Splits audio into paragraphs to improve transcript readability + required: false + schema: + type: boolean + default: false + - name: profanity_filter + in: query + description: >- + Profanity Filter looks for recognized profanity and converts it to + the nearest recognized non-profane word or removes it from the + transcript completely + required: false + schema: + type: boolean + default: false + - name: punctuate + in: query + description: Add punctuation and capitalization to the transcript + required: false + schema: + type: boolean + default: false + - name: redact + in: query + description: Redaction removes sensitive information from your transcripts + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersRedact' + - name: replace + in: query + description: Search for terms or phrases in submitted audio and replace them + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersReplace' + - name: search + in: query + description: Search for terms or phrases in submitted audio + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersSearch' + - name: smart_format + in: query + description: >- + Apply formatting to transcript output. When set to true, additional + formatting will be applied to transcripts to improve readability + required: false + schema: + type: boolean + default: false + - name: utterances + in: query + description: Segments speech into meaningful semantic units + required: false + schema: + type: boolean + default: false + - name: utt_split + in: query + description: >- + Seconds to wait before detecting a pause between words in submitted + audio + required: false + schema: + type: number + format: double + default: 0.8 + - name: version + in: query + description: Version of an AI model to use + required: false + schema: + $ref: '#/components/schemas/V1ListenPostParametersVersion' + - name: mip_opt_out + in: query + description: >- + Opts out requests from the Deepgram Model Improvement Program. Refer + to our Docs for pricing impacts before setting this to true. + https://dpgr.am/deepgram-mip + required: false + schema: + type: boolean + default: false + - name: Authorization + in: header + description: Header authentication of the form `undefined <token>` + required: true + schema: + type: string + responses: + '200': + description: >- + Returns either transcription results, or a request_id when using a + callback.
+ content: + application/json: + schema: + $ref: '#/components/schemas/listen_v1_media_transcribe_Response_200' + '400': + description: Invalid Request + content: {} + requestBody: + description: Transcribe an audio or video file + content: + application/json: + schema: + $ref: '#/components/schemas/ListenV1RequestUrl' +components: + schemas: + V1ListenPostParametersCallbackMethod: + type: string + enum: + - value: POST + - value: PUT + default: POST + V1ListenPostParametersExtra: + oneOf: + - type: string + - type: array + items: + type: string + V1ListenPostParametersSummarize0: + type: string + enum: + - value: v2 + V1ListenPostParametersSummarize: + oneOf: + - $ref: '#/components/schemas/V1ListenPostParametersSummarize0' + - type: boolean + default: false + V1ListenPostParametersTag: + oneOf: + - type: string + - type: array + items: + type: string + V1ListenPostParametersCustomTopic: + oneOf: + - type: string + - type: array + items: + type: string + V1ListenPostParametersCustomTopicMode: + type: string + enum: + - value: extended + - value: strict + default: extended + V1ListenPostParametersCustomIntent: + oneOf: + - type: string + - type: array + items: + type: string + V1ListenPostParametersCustomIntentMode: + type: string + enum: + - value: extended + - value: strict + default: extended + V1ListenPostParametersDetectLanguage: + oneOf: + - type: boolean + default: false + - type: array + items: + type: string + V1ListenPostParametersEncoding: + type: string + enum: + - value: linear16 + - value: flac + - value: mulaw + - value: amr-nb + - value: amr-wb + - value: opus + - value: speex + - value: g729 + V1ListenPostParametersKeywords: + oneOf: + - type: string + - type: array + items: + type: string + V1ListenPostParametersModel0: + type: string + enum: + - value: nova-3 + - value: nova-3-general + - value: nova-3-medical + - value: nova-2 + - value: nova-2-general + - value: nova-2-meeting + - value: nova-2-finance + - value: nova-2-conversationalai + - value: nova-2-voicemail + - value: nova-2-video + - value: nova-2-medical + - value: nova-2-drivethru + - value: nova-2-automotive + - value: nova + - value: nova-general + - value: nova-phonecall + - value: nova-medical + - value: enhanced + - value: enhanced-general + - value: enhanced-meeting + - value: enhanced-phonecall + - value: enhanced-finance + - value: base + - value: meeting + - value: phonecall + - value: finance + - value: conversationalai + - value: voicemail + - value: video + V1ListenPostParametersModel: + oneOf: + - $ref: '#/components/schemas/V1ListenPostParametersModel0' + - type: string + V1ListenPostParametersRedactSchemaOneOf1Items: + type: string + enum: + - value: pci + - value: pii + - value: numbers + V1ListenPostParametersRedact1: + type: array + items: + $ref: '#/components/schemas/V1ListenPostParametersRedactSchemaOneOf1Items' + V1ListenPostParametersRedact: + oneOf: + - type: string + - $ref: '#/components/schemas/V1ListenPostParametersRedact1' + V1ListenPostParametersReplace: + oneOf: + - type: string + - type: array + items: + type: string + V1ListenPostParametersSearch: + oneOf: + - type: string + - type: array + items: + type: string + V1ListenPostParametersVersion0: + type: string + enum: + - value: latest + V1ListenPostParametersVersion: + oneOf: + - $ref: '#/components/schemas/V1ListenPostParametersVersion0' + - type: string + ListenV1RequestUrl: + type: object + properties: + url: + type: string + format: uri + required: + - url + ListenV1ResponseMetadataModelInfo: + type: object + properties: 
+    ListenV1ResponseMetadataSummaryInfo:
+      type: object
+      properties:
+        model_uuid:
+          type: string
+        input_tokens:
+          type: number
+          format: double
+        output_tokens:
+          type: number
+          format: double
+    ListenV1ResponseMetadataSentimentInfo:
+      type: object
+      properties:
+        model_uuid:
+          type: string
+        input_tokens:
+          type: number
+          format: double
+        output_tokens:
+          type: number
+          format: double
+    ListenV1ResponseMetadataTopicsInfo:
+      type: object
+      properties:
+        model_uuid:
+          type: string
+        input_tokens:
+          type: number
+          format: double
+        output_tokens:
+          type: number
+          format: double
+    ListenV1ResponseMetadataIntentsInfo:
+      type: object
+      properties:
+        model_uuid:
+          type: string
+        input_tokens:
+          type: number
+          format: double
+        output_tokens:
+          type: number
+          format: double
+    ListenV1ResponseMetadata:
+      type: object
+      properties:
+        transaction_key:
+          type: string
+          default: deprecated
+        request_id:
+          type: string
+          format: uuid
+        sha256:
+          type: string
+        created:
+          type: string
+          format: date-time
+        duration:
+          type: number
+          format: double
+        channels:
+          type: number
+          format: double
+        models:
+          type: array
+          items:
+            type: string
+        model_info:
+          $ref: '#/components/schemas/ListenV1ResponseMetadataModelInfo'
+        summary_info:
+          $ref: '#/components/schemas/ListenV1ResponseMetadataSummaryInfo'
+        sentiment_info:
+          $ref: '#/components/schemas/ListenV1ResponseMetadataSentimentInfo'
+        topics_info:
+          $ref: '#/components/schemas/ListenV1ResponseMetadataTopicsInfo'
+        intents_info:
+          $ref: '#/components/schemas/ListenV1ResponseMetadataIntentsInfo'
+        tags:
+          type: array
+          items:
+            type: string
+      required:
+        - request_id
+        - sha256
+        - created
+        - duration
+        - channels
+        - models
+        - model_info
+    ListenV1ResponseResultsChannelsItemsSearchItemsHitsItems:
+      type: object
+      properties:
+        confidence:
+          type: number
+          format: double
+        start:
+          type: number
+          format: double
+        end:
+          type: number
+          format: double
+        snippet:
+          type: string
+    ListenV1ResponseResultsChannelsItemsSearchItems:
+      type: object
+      properties:
+        query:
+          type: string
+        hits:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsChannelsItemsSearchItemsHitsItems
+    ListenV1ResponseResultsChannelsItemsAlternativesItemsWordsItems:
+      type: object
+      properties:
+        word:
+          type: string
+        start:
+          type: number
+          format: double
+        end:
+          type: number
+          format: double
+        confidence:
+          type: number
+          format: double
+    ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphsParagraphsItemsSentencesItems:
+      type: object
+      properties:
+        text:
+          type: string
+        start:
+          type: number
+          format: double
+        end:
+          type: number
+          format: double
+    ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphsParagraphsItems:
+      type: object
+      properties:
+        sentences:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphsParagraphsItemsSentencesItems
+        speaker:
+          type: number
+          format: double
+        num_words:
+          type: number
+          format: double
+        start:
+          type: number
+          format: double
+        end:
+          type: number
+          format: double
+    ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphs:
+      type: object
+      properties:
+        transcript:
+          type: string
+        paragraphs:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphsParagraphsItems
+    ListenV1ResponseResultsChannelsItemsAlternativesItemsEntitiesItems:
+      type: object
+      properties:
+        label:
+          type: string
+        value:
+          type: string
+        raw_value:
+          type: string
+        confidence:
+          type: number
+          format: double
+        start_word:
+          type: number
+          format: double
+        end_word:
+          type: number
+          format: double
+    ListenV1ResponseResultsChannelsItemsAlternativesItemsSummariesItems:
+      type: object
+      properties:
+        summary:
+          type: string
+        start_word:
+          type: number
+          format: double
+        end_word:
+          type: number
+          format: double
+    ListenV1ResponseResultsChannelsItemsAlternativesItemsTopicsItems:
+      type: object
+      properties:
+        text:
+          type: string
+        start_word:
+          type: number
+          format: double
+        end_word:
+          type: number
+          format: double
+        topics:
+          type: array
+          items:
+            type: string
+    ListenV1ResponseResultsChannelsItemsAlternativesItems:
+      type: object
+      properties:
+        transcript:
+          type: string
+        confidence:
+          type: number
+          format: double
+        words:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsWordsItems
+        paragraphs:
+          $ref: >-
+            #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphs
+        entities:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsEntitiesItems
+        summaries:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsSummariesItems
+        topics:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsTopicsItems
+    ListenV1ResponseResultsChannelsItems:
+      type: object
+      properties:
+        search:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsChannelsItemsSearchItems
+        alternatives:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItems
+        detected_language:
+          type: string
+    ListenV1ResponseResultsChannels:
+      type: array
+      items:
+        $ref: '#/components/schemas/ListenV1ResponseResultsChannelsItems'
+    ListenV1ResponseResultsUtterancesItemsWordsItems:
+      type: object
+      properties:
+        word:
+          type: string
+        start:
+          type: number
+          format: double
+        end:
+          type: number
+          format: double
+        confidence:
+          type: number
+          format: double
+        speaker:
+          type: number
+          format: double
+        speaker_confidence:
+          type: number
+          format: double
+        punctuated_word:
+          type: string
+    ListenV1ResponseResultsUtterancesItems:
+      type: object
+      properties:
+        start:
+          type: number
+          format: double
+        end:
+          type: number
+          format: double
+        confidence:
+          type: number
+          format: double
+        channel:
+          type: number
+          format: double
+        transcript:
+          type: string
+        words:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/ListenV1ResponseResultsUtterancesItemsWordsItems
+        speaker:
+          type: number
+          format: double
+        id:
+          type: string
+          format: uuid
+    ListenV1ResponseResultsUtterances:
+      type: array
+      items:
+        $ref: '#/components/schemas/ListenV1ResponseResultsUtterancesItems'
+    ListenV1ResponseResultsSummary:
+      type: object
+      properties:
+        result:
+          type: string
+        short:
+          type: string
+    SharedTopicsResultsTopicsSegmentsItemsTopicsItems:
+      type: object
+      properties:
+        topic:
+          type: string
+        confidence_score:
+          type: number
+          format: double
+    SharedTopicsResultsTopicsSegmentsItems:
+      type: object
+      properties:
+        text:
+          type: string
+        start_word:
+          type: number
+          format: double
+        end_word:
+          type: number
+          format: double
+        topics:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/SharedTopicsResultsTopicsSegmentsItemsTopicsItems
+    SharedTopicsResultsTopics:
+      type: object
+      properties:
+        segments:
+          type: array
+          items:
+            $ref: '#/components/schemas/SharedTopicsResultsTopicsSegmentsItems'
+    SharedTopicsResults:
+      type: object
+      properties:
+        topics:
+          $ref: '#/components/schemas/SharedTopicsResultsTopics'
+    SharedTopics:
+      type: object
+      properties:
+        results:
+          $ref: '#/components/schemas/SharedTopicsResults'
+    SharedIntentsResultsIntentsSegmentsItemsIntentsItems:
+      type: object
+      properties:
+        intent:
+          type: string
+        confidence_score:
+          type: number
+          format: double
+    SharedIntentsResultsIntentsSegmentsItems:
+      type: object
+      properties:
+        text:
+          type: string
+        start_word:
+          type: number
+          format: double
+        end_word:
+          type: number
+          format: double
+        intents:
+          type: array
+          items:
+            $ref: >-
+              #/components/schemas/SharedIntentsResultsIntentsSegmentsItemsIntentsItems
+    SharedIntentsResultsIntents:
+      type: object
+      properties:
+        segments:
+          type: array
+          items:
+            $ref: '#/components/schemas/SharedIntentsResultsIntentsSegmentsItems'
+    SharedIntentsResults:
+      type: object
+      properties:
+        intents:
+          $ref: '#/components/schemas/SharedIntentsResultsIntents'
+    SharedIntents:
+      type: object
+      properties:
+        results:
+          $ref: '#/components/schemas/SharedIntentsResults'
+    SharedSentimentsSegmentsItems:
+      type: object
+      properties:
+        text:
+          type: string
+        start_word:
+          type: number
+          format: double
+        end_word:
+          type: number
+          format: double
+        sentiment:
+          type: string
+        sentiment_score:
+          type: number
+          format: double
+    SharedSentimentsAverage:
+      type: object
+      properties:
+        sentiment:
+          type: string
+        sentiment_score:
+          type: number
+          format: double
+    SharedSentiments:
+      type: object
+      properties:
+        segments:
+          type: array
+          items:
+            $ref: '#/components/schemas/SharedSentimentsSegmentsItems'
+        average:
+          $ref: '#/components/schemas/SharedSentimentsAverage'
+    ListenV1ResponseResults:
+      type: object
+      properties:
+        channels:
+          $ref: '#/components/schemas/ListenV1ResponseResultsChannels'
+        utterances:
+          $ref: '#/components/schemas/ListenV1ResponseResultsUtterances'
+        summary:
+          $ref: '#/components/schemas/ListenV1ResponseResultsSummary'
+        topics:
+          $ref: '#/components/schemas/SharedTopics'
+        intents:
+          $ref: '#/components/schemas/SharedIntents'
+        sentiments:
+          $ref: '#/components/schemas/SharedSentiments'
+      required:
+        - channels
+    ListenV1Response:
+      type: object
+      properties:
+        metadata:
+          $ref: '#/components/schemas/ListenV1ResponseMetadata'
+        results:
+          $ref: '#/components/schemas/ListenV1ResponseResults'
+      required:
+        - metadata
+        - results
+    ListenV1AcceptedResponse:
+      type: object
+      properties:
+        request_id:
+          type: string
+          format: uuid
+          description: Unique identifier for tracking the asynchronous request
+      required:
+        - request_id
+    listen_v1_media_transcribe_Response_200:
+      oneOf:
+        - $ref: '#/components/schemas/ListenV1Response'
+        - $ref: '#/components/schemas/ListenV1AcceptedResponse'
+
+```
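+
+Note that `listen_v1_media_transcribe_Response_200` is a `oneOf`: a synchronous request returns a full `ListenV1Response`, while a request that registers a `callback` (see `V1ListenPostParametersCallbackMethod` above) is only acknowledged with a `ListenV1AcceptedResponse` carrying the `request_id`. A minimal Swift sketch of how a client might branch on the two shapes; the `Codable` types below are our own illustration, not part of any Deepgram SDK:
+
+```swift
+import Foundation
+
+// Illustrative mirrors of the two schemas in the oneOf above.
+struct ListenAccepted: Decodable {
+    let request_id: String  // sole field of the async acknowledgement
+}
+
+struct ListenResult: Decodable {
+    struct Metadata: Decodable { let request_id: String; let duration: Double }
+    struct Alternative: Decodable { let transcript: String; let confidence: Double }
+    struct Channel: Decodable { let alternatives: [Alternative] }
+    struct Results: Decodable { let channels: [Channel] }
+    let metadata: Metadata
+    let results: Results
+}
+
+// Try the full response first; fall back to the async acknowledgement.
+func handleListenResponse(_ data: Data) {
+    let decoder = JSONDecoder()
+    if let full = try? decoder.decode(ListenResult.self, from: data) {
+        let transcript = full.results.channels.first?.alternatives.first?.transcript
+        print("transcript:", transcript ?? "<empty>")
+    } else if let accepted = try? decoder.decode(ListenAccepted.self, from: data) {
+        print("queued for callback, request_id:", accepted.request_id)
+    } else {
+        print("unrecognized payload")
+    }
+}
+```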
+
+## SDK Code Examples
+
+```python
+import requests
+
+url = "https://api.deepgram.com/v1/listen"
+
+payload = { "url": "https://dpgr.am/spacewalk.wav" }
+headers = {
+    "Authorization": "",
+    "Content-Type": "application/json"
+}
+
+response = requests.post(url, json=payload, headers=headers)
+
+print(response.json())
+```
+
+```javascript
+const url = 'https://api.deepgram.com/v1/listen';
+const options = {
+  method: 'POST',
+  headers: {Authorization: '', 'Content-Type': 'application/json'},
+  body: '{"url":"https://dpgr.am/spacewalk.wav"}'
+};
+
+try {
+  const response = await fetch(url, options);
+  const data = await response.json();
+  console.log(data);
+} catch (error) {
+  console.error(error);
+}
+```
+
+```go
+package main
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+)
+
+func main() {
+	url := "https://api.deepgram.com/v1/listen"
+
+	payload := strings.NewReader("{\n \"url\": \"https://dpgr.am/spacewalk.wav\"\n}")
+
+	req, _ := http.NewRequest("POST", url, payload)
+
+	req.Header.Add("Authorization", "")
+	req.Header.Add("Content-Type", "application/json")
+
+	res, _ := http.DefaultClient.Do(req)
+	defer res.Body.Close()
+
+	body, _ := io.ReadAll(res.Body)
+
+	fmt.Println(res)
+	fmt.Println(string(body))
+}
+```
+
+```ruby
+require 'uri'
+require 'net/http'
+
+url = URI("https://api.deepgram.com/v1/listen")
+
+http = Net::HTTP.new(url.host, url.port)
+http.use_ssl = true
+
+request = Net::HTTP::Post.new(url)
+request["Authorization"] = ''
+request["Content-Type"] = 'application/json'
+request.body = "{\n \"url\": \"https://dpgr.am/spacewalk.wav\"\n}"
+
+response = http.request(request)
+puts response.read_body
+```
+
+```java
+HttpResponse<String> response = Unirest.post("https://api.deepgram.com/v1/listen")
+  .header("Authorization", "")
+  .header("Content-Type", "application/json")
+  .body("{\n \"url\": \"https://dpgr.am/spacewalk.wav\"\n}")
+  .asString();
+```
+
+```php
+<?php
+
+$client = new \GuzzleHttp\Client();
+
+$response = $client->request('POST', 'https://api.deepgram.com/v1/listen', [
+  'body' => '{
+  "url": "https://dpgr.am/spacewalk.wav"
+}',
+  'headers' => [
+    'Authorization' => '',
+    'Content-Type' => 'application/json',
+  ],
+]);
+
+echo $response->getBody();
+```
+
+```csharp
+var client = new RestClient("https://api.deepgram.com/v1/listen");
+var request = new RestRequest(Method.POST);
+request.AddHeader("Authorization", "");
+request.AddHeader("Content-Type", "application/json");
+request.AddParameter("application/json", "{\n \"url\": \"https://dpgr.am/spacewalk.wav\"\n}", ParameterType.RequestBody);
+IRestResponse response = client.Execute(request);
+```
+
+```swift
+import Foundation
+
+let headers = [
+  "Authorization": "",
+  "Content-Type": "application/json"
+]
+let parameters = ["url": "https://dpgr.am/spacewalk.wav"] as [String : Any]
+
+let postData = try! JSONSerialization.data(withJSONObject: parameters, options: [])
+
+let request = NSMutableURLRequest(url: NSURL(string: "https://api.deepgram.com/v1/listen")! as URL,
+                                  cachePolicy: .useProtocolCachePolicy,
+                                  timeoutInterval: 10.0)
+request.httpMethod = "POST"
+request.allHTTPHeaderFields = headers
+request.httpBody = postData
+
+let session = URLSession.shared
+let dataTask = session.dataTask(with: request as URLRequest) { data, response, error in
+    if let error = error {
+        print(error)
+    } else if let httpResponse = response as? HTTPURLResponse {
+        print(httpResponse)
+    }
+}
+
+dataTask.resume()
+```
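+
+The `V1ListenPostParameters*` schemas above (`model`, `summarize`, `redact`, `detect_language`, ...) describe options that `/v1/listen` accepts as query parameters. A minimal Swift sketch of attaching a few of them to the request; the snake_case parameter names are inferred from the schema names, so check Deepgram's parameter reference before relying on this exact set:
+
+```swift
+import Foundation
+
+// Build /v1/listen with query parameters drawn from the schemas above:
+// model (V1ListenPostParametersModel0), summarize (...Summarize0),
+// detect_language (...DetectLanguage). Treat the exact set as illustrative.
+var components = URLComponents(string: "https://api.deepgram.com/v1/listen")!
+components.queryItems = [
+    URLQueryItem(name: "model", value: "nova-3"),
+    URLQueryItem(name: "summarize", value: "v2"),
+    URLQueryItem(name: "detect_language", value: "true")
+]
+
+var request = URLRequest(url: components.url!)
+request.httpMethod = "POST"
+request.setValue("application/json", forHTTPHeaderField: "Content-Type")
+request.setValue("", forHTTPHeaderField: "Authorization")  // API credential goes here
+request.httpBody = try! JSONSerialization.data(withJSONObject: ["url": "https://dpgr.am/spacewalk.wav"])
+
+URLSession.shared.dataTask(with: request) { data, _, error in
+    if let data = data {
+        print(String(data: data, encoding: .utf8) ?? "")
+    } else if let error = error {
+        print(error)
+    }
+}.resume()
+```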