From 9fb2e2e694224779b2dde186c71458b775b6384b Mon Sep 17 00:00:00 2001 From: CodeST <694468528@qq.com> Date: Sat, 7 Mar 2026 18:49:34 +0800 Subject: [PATCH] 1 --- keyBoard.xcodeproj/project.pbxproj | 183 ++-- keyBoard/Class/AiTalk/AI技术分析.txt | 521 ----------- keyBoard/Class/AiTalk/deepgramAPI.md | 1119 ------------------------ keyBoard/Class/AiTalk/websocket-api.md | 771 ---------------- 4 files changed, 85 insertions(+), 2509 deletions(-) delete mode 100644 keyBoard/Class/AiTalk/AI技术分析.txt delete mode 100644 keyBoard/Class/AiTalk/deepgramAPI.md delete mode 100644 keyBoard/Class/AiTalk/websocket-api.md diff --git a/keyBoard.xcodeproj/project.pbxproj b/keyBoard.xcodeproj/project.pbxproj index d832132..7a27809 100644 --- a/keyBoard.xcodeproj/project.pbxproj +++ b/keyBoard.xcodeproj/project.pbxproj @@ -78,7 +78,6 @@ 045ED5282F53F4B000131114 /* KBInputProfileManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 045ED5262F53F4AF00131114 /* KBInputProfileManager.m */; }; 045ED52B2F540FBE00131114 /* normal_hei_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 045ED5292F540FBE00131114 /* normal_hei_them.zip */; }; 045ED52C2F540FBE00131114 /* normal_them.zip in Resources */ = {isa = PBXBuildFile; fileRef = 045ED52A2F540FBE00131114 /* normal_them.zip */; }; - 046086752F191CC700757C95 /* AI技术分析.txt in Resources */ = {isa = PBXBuildFile; fileRef = 046086742F191CC700757C95 /* AI技术分析.txt */; }; 0460869A2F19238500757C95 /* KBAiWaveformView.m in Sources */ = {isa = PBXBuildFile; fileRef = 046086992F19238500757C95 /* KBAiWaveformView.m */; }; 0460869C2F19238500757C95 /* KBAiRecordButton.m in Sources */ = {isa = PBXBuildFile; fileRef = 046086972F19238500757C95 /* KBAiRecordButton.m */; }; 046086B32F19239B00757C95 /* AudioSessionManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 046086A22F19239B00757C95 /* AudioSessionManager.m */; }; @@ -215,13 +214,11 @@ 04A9FE0F2EB481100020DB6D /* KBHUD.m in Sources */ = {isa = PBXBuildFile; fileRef = 04FC97082EB31B14007BD342 /* 
KBHUD.m */; }; 04A9FE132EB4D0D20020DB6D /* KBFullAccessManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04A9FE112EB4D0D20020DB6D /* KBFullAccessManager.m */; }; 04A9FE162EB873C80020DB6D /* UIViewController+Extension.m in Sources */ = {isa = PBXBuildFile; fileRef = 04A9FE152EB873C80020DB6D /* UIViewController+Extension.m */; }; - 04A9FE1A2EB892460020DB6D /* KBLocalizationManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04A9FE192EB892460020DB6D /* KBLocalizationManager.m */; }; - 04A9FE1B2EB892460020DB6D /* KBLocalizationManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04A9FE192EB892460020DB6D /* KBLocalizationManager.m */; }; - 04A9FE202EB893F10020DB6D /* Localizable.strings in Resources */ = {isa = PBXBuildFile; fileRef = 04A9FE1E2EB893F10020DB6D /* Localizable.strings */; }; - 04A9FE212EB893F10020DB6D /* Localizable.strings in Resources */ = {isa = PBXBuildFile; fileRef = 04A9FE1E2EB893F10020DB6D /* Localizable.strings */; }; - E0A100102F60000100ABCDEF /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = E0A100002F60000100ABCDEF /* InfoPlist.strings */; }; - E0A100112F60000100ABCDEF /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = E0A100002F60000100ABCDEF /* InfoPlist.strings */; }; - 04B5A1A22EEFA12300AAAAAA /* KBPayProductModel.m in Sources */ = {isa = PBXBuildFile; fileRef = 04B5A1A12EEFA12300AAAAAA /* KBPayProductModel.m */; }; + 04A9FE1A2EB892460020DB6D /* KBLocalizationManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04A9FE192EB892460020DB6D /* KBLocalizationManager.m */; }; + 04A9FE1B2EB892460020DB6D /* KBLocalizationManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04A9FE192EB892460020DB6D /* KBLocalizationManager.m */; }; + 04A9FE202EB893F10020DB6D /* Localizable.strings in Resources */ = {isa = PBXBuildFile; fileRef = 04A9FE1E2EB893F10020DB6D /* Localizable.strings */; }; + 04A9FE212EB893F10020DB6D /* Localizable.strings in Resources */ = {isa = PBXBuildFile; fileRef = 
04A9FE1E2EB893F10020DB6D /* Localizable.strings */; }; + 04B5A1A22EEFA12300AAAAAA /* KBPayProductModel.m in Sources */ = {isa = PBXBuildFile; fileRef = 04B5A1A12EEFA12300AAAAAA /* KBPayProductModel.m */; }; 04BBF89D2F3ACD8800B1FBB2 /* KBKeyboardStressTestVC.m in Sources */ = {isa = PBXBuildFile; fileRef = 04BBF89A2F3ACD8800B1FBB2 /* KBKeyboardStressTestVC.m */; }; 04BBF89E2F3ACD8800B1FBB2 /* KBTestVC.m in Sources */ = {isa = PBXBuildFile; fileRef = 04BBF89C2F3ACD8800B1FBB2 /* KBTestVC.m */; }; 04BBF9002F3C97CB00B1FBB2 /* DeepgramWebSocketClient.m in Sources */ = {isa = PBXBuildFile; fileRef = 04BBF8FF2F3C97CB00B1FBB2 /* DeepgramWebSocketClient.m */; }; @@ -237,8 +234,6 @@ 04D1F6B22EDFF10A00B12345 /* KBSkinInstallBridge.m in Sources */ = {isa = PBXBuildFile; fileRef = 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */; }; 04D1F6B32EDFF10A00B12345 /* KBSkinInstallBridge.m in Sources */ = {isa = PBXBuildFile; fileRef = 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */; }; 04E0383E2F1A7C30002CA5A0 /* KBCustomTabBar.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E0383D2F1A7C30002CA5A0 /* KBCustomTabBar.m */; }; - 04E038D82F20BFFB002CA5A0 /* websocket-api.md in Resources */ = {isa = PBXBuildFile; fileRef = 04E038D72F20BFFB002CA5A0 /* websocket-api.md */; }; - 04E038E32F20E500002CA5A0 /* deepgramAPI.md in Resources */ = {isa = PBXBuildFile; fileRef = 04E038E22F20E500002CA5A0 /* deepgramAPI.md */; }; 04E038E92F20E877002CA5A0 /* DeepgramStreamingManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038E52F20E877002CA5A0 /* DeepgramStreamingManager.m */; }; 04E038EF2F21F0EC002CA5A0 /* AiVM.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E038EE2F21F0EC002CA5A0 /* AiVM.m */; }; 04E0394B2F236E75002CA5A0 /* KBChatUserMessageCell.m in Sources */ = {isa = PBXBuildFile; fileRef = 04E0394A2F236E75002CA5A0 /* KBChatUserMessageCell.m */; }; @@ -339,6 +334,8 @@ B7F1A1E22F90000100000001 /* indonesian_words.json in Resources */ = {isa = PBXBuildFile; fileRef = 
B7F1A1E42F90000100000001 /* indonesian_words.json */; }; B7F1A1E52F90000100000001 /* english_words.json in Resources */ = {isa = PBXBuildFile; fileRef = B7F1A1E62F90000100000001 /* english_words.json */; }; B7F1A1F32FA0000100000001 /* kb_diacritics_map.json in Resources */ = {isa = PBXBuildFile; fileRef = B7F1A1F22FA0000100000001 /* kb_diacritics_map.json */; }; + E0A100102F60000100ABCDEF /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = E0A100002F60000100ABCDEF /* InfoPlist.strings */; }; + E0A100112F60000100ABCDEF /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = E0A100002F60000100ABCDEF /* InfoPlist.strings */; }; EB72B60040437E3C0A4890FC /* KBShopThemeDetailModel.m in Sources */ = {isa = PBXBuildFile; fileRef = B9F60894E529C3EDAF6BAC3D /* KBShopThemeDetailModel.m */; }; ECC9EE02174D86E8D792472F /* Pods_keyBoard.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 967065BB5230E43F293B3AF9 /* Pods_keyBoard.framework */; }; /* End PBXBuildFile section */ @@ -459,7 +456,6 @@ 045ED5262F53F4AF00131114 /* KBInputProfileManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBInputProfileManager.m; sourceTree = ""; }; 045ED5292F540FBE00131114 /* normal_hei_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = normal_hei_them.zip; sourceTree = ""; }; 045ED52A2F540FBE00131114 /* normal_them.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = normal_them.zip; sourceTree = ""; }; - 046086742F191CC700757C95 /* AI技术分析.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = "AI技术分析.txt"; sourceTree = ""; }; 046086962F19238500757C95 /* KBAiRecordButton.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBAiRecordButton.h; sourceTree = ""; }; 046086972F19238500757C95 /* KBAiRecordButton.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBAiRecordButton.m; sourceTree = ""; }; 
046086982F19238500757C95 /* KBAiWaveformView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBAiWaveformView.h; sourceTree = ""; }; @@ -551,14 +547,10 @@ 04837AE52F5848680012BDE2 /* id */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = id; path = id.lproj/LaunchScreen.strings; sourceTree = ""; }; 04837AE62F5848680012BDE2 /* id */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = id; path = id.lproj/Main.strings; sourceTree = ""; }; 04837AE72F5848680012BDE2 /* id */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = id; path = id.lproj/Localizable.strings; sourceTree = ""; }; - 04837AE82F5848820012BDE2 /* pt-PT */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "pt-PT"; path = "pt-PT.lproj/LaunchScreen.strings"; sourceTree = ""; }; - 04837AE92F5848820012BDE2 /* pt-PT */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "pt-PT"; path = "pt-PT.lproj/Main.strings"; sourceTree = ""; }; - 04837AEA2F5848820012BDE2 /* pt-PT */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "pt-PT"; path = "pt-PT.lproj/Localizable.strings"; sourceTree = ""; }; - E0A100022F60000100ABCDEF /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hant"; path = "zh-Hant.lproj/InfoPlist.strings"; sourceTree = ""; }; - E0A100032F60000100ABCDEF /* es */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = es; path = es.lproj/InfoPlist.strings; sourceTree = ""; }; - E0A100042F60000100ABCDEF /* id */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = id; path = id.lproj/InfoPlist.strings; sourceTree = ""; }; - E0A100052F60000100ABCDEF /* pt-PT */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "pt-PT"; path = "pt-PT.lproj/InfoPlist.strings"; sourceTree = ""; }; - 048908BA2EBE1FCB00FABA60 /* BaseViewController.h */ = {isa = 
PBXFileReference; lastKnownFileType = sourcecode.c.h; path = BaseViewController.h; sourceTree = ""; }; + 04837AE82F5848820012BDE2 /* pt-PT */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "pt-PT"; path = "pt-PT.lproj/LaunchScreen.strings"; sourceTree = ""; }; + 04837AE92F5848820012BDE2 /* pt-PT */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "pt-PT"; path = "pt-PT.lproj/Main.strings"; sourceTree = ""; }; + 04837AEA2F5848820012BDE2 /* pt-PT */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "pt-PT"; path = "pt-PT.lproj/Localizable.strings"; sourceTree = ""; }; + 048908BA2EBE1FCB00FABA60 /* BaseViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = BaseViewController.h; sourceTree = ""; }; 048908BB2EBE1FCB00FABA60 /* BaseViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = BaseViewController.m; sourceTree = ""; }; 048908C12EBE32B800FABA60 /* KBSearchVC.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBSearchVC.h; sourceTree = ""; }; 048908C22EBE32B800FABA60 /* KBSearchVC.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBSearchVC.m; sourceTree = ""; }; @@ -729,11 +721,10 @@ 04A9FE112EB4D0D20020DB6D /* KBFullAccessManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBFullAccessManager.m; sourceTree = ""; }; 04A9FE142EB873C80020DB6D /* UIViewController+Extension.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "UIViewController+Extension.h"; sourceTree = ""; }; 04A9FE152EB873C80020DB6D /* UIViewController+Extension.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = "UIViewController+Extension.m"; sourceTree = ""; }; - 04A9FE182EB892460020DB6D /* KBLocalizationManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBLocalizationManager.h; 
sourceTree = ""; }; - 04A9FE192EB892460020DB6D /* KBLocalizationManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBLocalizationManager.m; sourceTree = ""; }; - 04A9FE1C2EB893F10020DB6D /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/Localizable.strings; sourceTree = ""; }; - E0A100012F60000100ABCDEF /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/InfoPlist.strings; sourceTree = ""; }; - 04B5A1A02EEFA12300AAAAAA /* KBPayProductModel.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBPayProductModel.h; sourceTree = ""; }; + 04A9FE182EB892460020DB6D /* KBLocalizationManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBLocalizationManager.h; sourceTree = ""; }; + 04A9FE192EB892460020DB6D /* KBLocalizationManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBLocalizationManager.m; sourceTree = ""; }; + 04A9FE1C2EB893F10020DB6D /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/Localizable.strings; sourceTree = ""; }; + 04B5A1A02EEFA12300AAAAAA /* KBPayProductModel.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBPayProductModel.h; sourceTree = ""; }; 04B5A1A12EEFA12300AAAAAA /* KBPayProductModel.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBPayProductModel.m; sourceTree = ""; }; 04BBF8992F3ACD8800B1FBB2 /* KBKeyboardStressTestVC.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBKeyboardStressTestVC.h; sourceTree = ""; }; 04BBF89A2F3ACD8800B1FBB2 /* KBKeyboardStressTestVC.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBKeyboardStressTestVC.m; sourceTree = ""; }; @@ -763,8 +754,6 @@ 04D1F6B12EDFF10A00B12345 /* KBSkinInstallBridge.m */ = {isa = PBXFileReference; 
lastKnownFileType = sourcecode.c.objc; path = KBSkinInstallBridge.m; sourceTree = ""; }; 04E0383C2F1A7C30002CA5A0 /* KBCustomTabBar.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KBCustomTabBar.h; sourceTree = ""; }; 04E0383D2F1A7C30002CA5A0 /* KBCustomTabBar.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBCustomTabBar.m; sourceTree = ""; }; - 04E038D72F20BFFB002CA5A0 /* websocket-api.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = "websocket-api.md"; sourceTree = ""; }; - 04E038E22F20E500002CA5A0 /* deepgramAPI.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = deepgramAPI.md; sourceTree = ""; }; 04E038E42F20E877002CA5A0 /* DeepgramStreamingManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DeepgramStreamingManager.h; sourceTree = ""; }; 04E038E52F20E877002CA5A0 /* DeepgramStreamingManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = DeepgramStreamingManager.m; sourceTree = ""; }; 04E038ED2F21F0EC002CA5A0 /* AiVM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AiVM.h; sourceTree = ""; }; @@ -954,6 +943,11 @@ B7F1A1F22FA0000100000001 /* kb_diacritics_map.json */ = {isa = PBXFileReference; lastKnownFileType = text.json; path = kb_diacritics_map.json; sourceTree = ""; }; B8CA018AB878499327504AAD /* Pods-CustomKeyboard.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-CustomKeyboard.debug.xcconfig"; path = "Target Support Files/Pods-CustomKeyboard/Pods-CustomKeyboard.debug.xcconfig"; sourceTree = ""; }; B9F60894E529C3EDAF6BAC3D /* KBShopThemeDetailModel.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBShopThemeDetailModel.m; sourceTree = ""; }; + E0A100012F60000100ABCDEF /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; 
name = en; path = en.lproj/InfoPlist.strings; sourceTree = ""; }; + E0A100022F60000100ABCDEF /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hant"; path = "zh-Hant.lproj/InfoPlist.strings"; sourceTree = ""; }; + E0A100032F60000100ABCDEF /* es */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = es; path = es.lproj/InfoPlist.strings; sourceTree = ""; }; + E0A100042F60000100ABCDEF /* id */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = id; path = id.lproj/InfoPlist.strings; sourceTree = ""; }; + E0A100052F60000100ABCDEF /* pt-PT */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "pt-PT"; path = "pt-PT.lproj/InfoPlist.strings"; sourceTree = ""; }; E2A844CD2D8584596DBE6316 /* KBShopThemeTagModel.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = KBShopThemeTagModel.m; sourceTree = ""; }; F67DDBD716E4E616D8CC2C9C /* Pods-keyBoard.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-keyBoard.debug.xcconfig"; path = "Target Support Files/Pods-keyBoard/Pods-keyBoard.debug.xcconfig"; sourceTree = ""; }; /* End PBXFileReference section */ @@ -1325,9 +1319,6 @@ 046086702F191A5100757C95 /* AiTalk */ = { isa = PBXGroup; children = ( - 046086742F191CC700757C95 /* AI技术分析.txt */, - 04E038D72F20BFFB002CA5A0 /* websocket-api.md */, - 04E038E22F20E500002CA5A0 /* deepgramAPI.md */, 0460866C2F191A5100757C95 /* M */, 0460866D2F191A5100757C95 /* V */, 0460866E2F191A5100757C95 /* VC */, @@ -1697,15 +1688,15 @@ path = Manager; sourceTree = ""; }; - 04A9FE1F2EB893F10020DB6D /* Localization */ = { - isa = PBXGroup; - children = ( - 04A9FE1E2EB893F10020DB6D /* Localizable.strings */, - E0A100002F60000100ABCDEF /* InfoPlist.strings */, - ); - path = Localization; - sourceTree = ""; - }; + 04A9FE1F2EB893F10020DB6D /* Localization */ = { + isa = PBXGroup; + children = ( + 
04A9FE1E2EB893F10020DB6D /* Localizable.strings */, + E0A100002F60000100ABCDEF /* InfoPlist.strings */, + ); + path = Localization; + sourceTree = ""; + }; 04C6EAB92EAF86530089C901 /* keyBoard */ = { isa = PBXGroup; children = ( @@ -2399,10 +2390,10 @@ 04C6EAC42EAF87020089C901 /* Resources */ = { isa = PBXResourcesBuildPhase; buildActionMask = 2147483647; - files = ( - 04A9FE202EB893F10020DB6D /* Localizable.strings in Resources */, - E0A100102F60000100ABCDEF /* InfoPlist.strings in Resources */, - 041007D22ECE012000D203BB /* KBSkinIconMap.strings in Resources */, + files = ( + 04A9FE202EB893F10020DB6D /* Localizable.strings in Resources */, + E0A100102F60000100ABCDEF /* InfoPlist.strings in Resources */, + 041007D22ECE012000D203BB /* KBSkinIconMap.strings in Resources */, 04E2277F2F516ED3001A8F14 /* PrivacyInfo.xcprivacy in Resources */, A1B2C3ED2F20000000000001 /* kb_words.txt in Resources */, A1B2C3F12F20000000000002 /* kb_keyboard_layout_config.json in Resources */, @@ -2431,14 +2422,12 @@ 045ED52B2F540FBE00131114 /* normal_hei_them.zip in Resources */, 045ED52C2F540FBE00131114 /* normal_them.zip in Resources */, 043213C62F56F5280065C888 /* 台湾省初始皮肤注音.zip in Resources */, - 04E038D82F20BFFB002CA5A0 /* websocket-api.md in Resources */, 0479200B2ED87CEE004E8522 /* permiss_video.mp4 in Resources */, - 04E2277D2F516EBD001A8F14 /* PrivacyInfo.xcprivacy in Resources */, - 04C6EABA2EAF86530089C901 /* Assets.xcassets in Resources */, - 04A9FE212EB893F10020DB6D /* Localizable.strings in Resources */, - E0A100112F60000100ABCDEF /* InfoPlist.strings in Resources */, - 047920072ED86ABC004E8522 /* kb_guide_keyboard.gif in Resources */, - 046086752F191CC700757C95 /* AI技术分析.txt in Resources */, + 04E2277D2F516EBD001A8F14 /* PrivacyInfo.xcprivacy in Resources */, + 04C6EABA2EAF86530089C901 /* Assets.xcassets in Resources */, + 04A9FE212EB893F10020DB6D /* Localizable.strings in Resources */, + E0A100112F60000100ABCDEF /* InfoPlist.strings in Resources */, + 
047920072ED86ABC004E8522 /* kb_guide_keyboard.gif in Resources */, 047920112ED98E7D004E8522 /* permiss_video_2.mp4 in Resources */, 04C6EABC2EAF86530089C901 /* LaunchScreen.storyboard in Resources */, 043213BD2F56A3920065C888 /* 西班牙初始皮肤.zip in Resources */, @@ -2448,7 +2437,6 @@ 04286A132ECDEBF900CE730C /* KBSkinIconMap.strings in Resources */, 04C6EABD2EAF86530089C901 /* Main.storyboard in Resources */, 046086CB2F1A092500757C95 /* comments_mock.json in Resources */, - 04E038E32F20E500002CA5A0 /* deepgramAPI.md in Resources */, 043213A92F5566EF0065C888 /* kb_input_profiles.json in Resources */, 043213C02F56C9330065C888 /* 印度尼西亚初始皮肤.zip in Resources */, ); @@ -2837,31 +2825,19 @@ /* End PBXTargetDependency section */ /* Begin PBXVariantGroup section */ - 04A9FE1E2EB893F10020DB6D /* Localizable.strings */ = { - isa = PBXVariantGroup; - children = ( - 04A9FE1C2EB893F10020DB6D /* en */, - 04837AE12F5848050012BDE2 /* zh-Hant */, - 04837AE42F58485A0012BDE2 /* es */, - 04837AE72F5848680012BDE2 /* id */, - 04837AEA2F5848820012BDE2 /* pt-PT */, - ); - name = Localizable.strings; - sourceTree = ""; - }; - E0A100002F60000100ABCDEF /* InfoPlist.strings */ = { - isa = PBXVariantGroup; - children = ( - E0A100012F60000100ABCDEF /* en */, - E0A100022F60000100ABCDEF /* zh-Hant */, - E0A100032F60000100ABCDEF /* es */, - E0A100042F60000100ABCDEF /* id */, - E0A100052F60000100ABCDEF /* pt-PT */, - ); - name = InfoPlist.strings; - sourceTree = ""; - }; - 04C6EAB12EAF86530089C901 /* LaunchScreen.storyboard */ = { + 04A9FE1E2EB893F10020DB6D /* Localizable.strings */ = { + isa = PBXVariantGroup; + children = ( + 04A9FE1C2EB893F10020DB6D /* en */, + 04837AE12F5848050012BDE2 /* zh-Hant */, + 04837AE42F58485A0012BDE2 /* es */, + 04837AE72F5848680012BDE2 /* id */, + 04837AEA2F5848820012BDE2 /* pt-PT */, + ); + name = Localizable.strings; + sourceTree = ""; + }; + 04C6EAB12EAF86530089C901 /* LaunchScreen.storyboard */ = { isa = PBXVariantGroup; children = ( 04C6EAB02EAF86530089C901 /* Base */, 
@@ -2885,6 +2861,18 @@ name = Main.storyboard; sourceTree = ""; }; + E0A100002F60000100ABCDEF /* InfoPlist.strings */ = { + isa = PBXVariantGroup; + children = ( + E0A100012F60000100ABCDEF /* en */, + E0A100022F60000100ABCDEF /* zh-Hant */, + E0A100032F60000100ABCDEF /* es */, + E0A100042F60000100ABCDEF /* id */, + E0A100052F60000100ABCDEF /* pt-PT */, + ); + name = InfoPlist.strings; + sourceTree = ""; + }; /* End PBXVariantGroup section */ /* Begin XCBuildConfiguration section */ @@ -2903,20 +2891,20 @@ "$(inherited)", "KB_KEYCHAIN_ACCESS_GROUP=@\\\"$(AppIdentifierPrefix)com.loveKey.nyx.shared\\\"", ); - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = CustomKeyboard/Info.plist; - INFOPLIST_KEY_CFBundleDisplayName = "Key of Love"; - INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access is required for voice input."; - INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "Photo library write access is required to save images."; - INFOPLIST_KEY_NSPhotoLibraryUsageDescription = "Photo library access is required to change your avatar."; - INFOPLIST_KEY_NSHumanReadableCopyright = ""; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = CustomKeyboard/Info.plist; + INFOPLIST_KEY_CFBundleDisplayName = "Key of Love"; + INFOPLIST_KEY_NSHumanReadableCopyright = ""; + INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access is required for voice input."; + INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "Photo library write access is required to save images."; + INFOPLIST_KEY_NSPhotoLibraryUsageDescription = "Photo library access is required to change your avatar."; IPHONEOS_DEPLOYMENT_TARGET = 15; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", "@executable_path/../../Frameworks", ); - MARKETING_VERSION = 1.0.0; + MARKETING_VERSION = 1.0.0; PRODUCT_BUNDLE_IDENTIFIER = com.loveKey.nyx.CustomKeyboard; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; @@ -2940,20 +2928,20 @@ "$(inherited)", 
"KB_KEYCHAIN_ACCESS_GROUP=@\\\"$(AppIdentifierPrefix)com.loveKey.nyx.shared\\\"", ); - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_FILE = CustomKeyboard/Info.plist; - INFOPLIST_KEY_CFBundleDisplayName = "Key of Love"; - INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access is required for voice input."; - INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "Photo library write access is required to save images."; - INFOPLIST_KEY_NSPhotoLibraryUsageDescription = "Photo library access is required to change your avatar."; - INFOPLIST_KEY_NSHumanReadableCopyright = ""; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_FILE = CustomKeyboard/Info.plist; + INFOPLIST_KEY_CFBundleDisplayName = "Key of Love"; + INFOPLIST_KEY_NSHumanReadableCopyright = ""; + INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access is required for voice input."; + INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "Photo library write access is required to save images."; + INFOPLIST_KEY_NSPhotoLibraryUsageDescription = "Photo library access is required to change your avatar."; IPHONEOS_DEPLOYMENT_TARGET = 15; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", "@executable_path/../../Frameworks", ); - MARKETING_VERSION = 1.0.0; + MARKETING_VERSION = 1.0.0; PRODUCT_BUNDLE_IDENTIFIER = com.loveKey.nyx.CustomKeyboard; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; @@ -2983,9 +2971,9 @@ INFOPLIST_FILE = keyBoard/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = "Key of Love"; INFOPLIST_KEY_CFBundleURLTypes = "{\n CFBundleURLName = \"com.loveKey.nyx.keyboard\";\n CFBundleURLSchemes = (\n kbkeyboardAppExtension\n );\n}"; - INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access is required for voice input."; - INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "Photo library write access is required to save images."; - INFOPLIST_KEY_NSPhotoLibraryUsageDescription = "Photo library access is required to change your avatar."; + 
INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access is required for voice input."; + INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "Photo library write access is required to save images."; + INFOPLIST_KEY_NSPhotoLibraryUsageDescription = "Photo library access is required to change your avatar."; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; INFOPLIST_KEY_UIMainStoryboardFile = Main; @@ -3034,9 +3022,9 @@ INFOPLIST_FILE = keyBoard/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = "Key of Love"; INFOPLIST_KEY_CFBundleURLTypes = "{\n CFBundleURLName = \"com.loveKey.nyx.keyboard\";\n CFBundleURLSchemes = (\n kbkeyboardAppExtension\n );\n}"; - INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access is required for voice input."; - INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "Photo library write access is required to save images."; - INFOPLIST_KEY_NSPhotoLibraryUsageDescription = "Photo library access is required to change your avatar."; + INFOPLIST_KEY_NSMicrophoneUsageDescription = "Microphone access is required for voice input."; + INFOPLIST_KEY_NSPhotoLibraryAddUsageDescription = "Photo library write access is required to save images."; + INFOPLIST_KEY_NSPhotoLibraryUsageDescription = "Photo library access is required to change your avatar."; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; INFOPLIST_KEY_UIMainStoryboardFile = Main; @@ -3212,7 +3200,6 @@ defaultConfigurationName = Release; }; /* End XCConfigurationList section */ - }; rootObject = 727EC74B2EAF848B00B36487 /* Project object */; } diff --git a/keyBoard/Class/AiTalk/AI技术分析.txt b/keyBoard/Class/AiTalk/AI技术分析.txt deleted file mode 100644 index 28e4757..0000000 --- a/keyBoard/Class/AiTalk/AI技术分析.txt +++ /dev/null @@ -1,521 +0,0 @@ - 服务 用途 示例格式 -ASR 服务器 语音识别(WebSocket) wss://api.example.com/asr -LLM 服务器 AI 对话(HTTP SSE) https://api.example.com/chat 
-TTS 服务器 语音合成 https://api.example.com/tts - -iOS(Objective-C,iOS 15+)端技术实现文档 -低延迟流式语音陪伴聊天(按住说话,类似猫箱首页) -0. 范围与目标 - -实现首页语音陪伴对话: - -按住说话:开始录音并实时流式发送到 ASR - -松开结束:ASR 立即 finalize,返回最终文本并显示 - -AI 回复:边显示文字(打字机效果)边播放服务端 TTS 音频 - -延迟低优先:不等待完整回答/完整音频,采用“分句触发 + 流式/准流式播放” - -打断(Barge-in):AI 正在播报时用户再次按住 → 立即停止播报/取消请求,进入新一轮录音 - -iOS 最低版本:iOS 15 - -1. 总体架构(客户端模块) -KBAiMainVC - └─ ConversationOrchestrator (核心状态机 / 串联模块 / 取消与打断) - ├─ AudioSessionManager (AVAudioSession 配置与中断处理) - ├─ AudioCaptureManager (AVAudioEngine input tap -> 20ms PCM frames) - ├─ ASRStreamClient (NSURLSessionWebSocketTask 流式识别) - ├─ LLMStreamClient (SSE/WS token stream) - ├─ Segmenter (句子切分:够一句就触发 TTS) - ├─ TTSServiceClient (请求 TTS,适配多种返回形态) - ├─ TTSPlaybackPipeline (可插拔:URL播放器 / AAC解码 / PCM直喂) - ├─ AudioStreamPlayer (AVAudioEngine + AVAudioPlayerNode 播 PCM) - └─ SubtitleSync (按播放进度映射文字进度) - -2. 音频会话(AVAudioSession)与权限 -2.1 麦克风权限 - -仅在用户第一次按住说话前请求 - -若用户拒绝:提示到设置开启 - -2.2 AudioSession 配置(对话模式) - -Objective-C(建议参数): - -category:AVAudioSessionCategoryPlayAndRecord - -mode:AVAudioSessionModeVoiceChat - -options: - -AVAudioSessionCategoryOptionDefaultToSpeaker - -AVAudioSessionCategoryOptionAllowBluetooth - -(可选)AVAudioSessionCategoryOptionMixWithOthers:若你希望不打断宿主音频(看产品) - -2.3 中断与路由变化处理(必须) - -监听: - -AVAudioSessionInterruptionNotification - -AVAudioSessionRouteChangeNotification - -处理原则: - -来电/中断开始:停止采集 + 停止播放 + cancel 网络会话 - -中断结束:回到 Idle,等待用户重新按住 - -3. 
音频采集(按住期间流式上传) -3.1 固定音频参数(锁死,便于端到端稳定) - -Sample Rate:16000 Hz - -Channels:1 - -Format:PCM Int16(pcm_s16le) - -Frame Duration:20ms - -16kHz * 0.02s = 320 samples - -每帧 bytes = 320 * 2 = 640 bytes - -3.2 AudioCaptureManager(AVAudioEngine 输入 tap) - -使用: - -AVAudioEngine - -inputNode installTapOnBus:bufferSize:format:block: - -关键点: - -tap 回调线程不可做重活:只做拷贝 + dispatch 到 audioQueue - -将 AVAudioPCMBuffer 转成 Int16 PCM NSData - -确保稳定输出“20ms帧”,如果 tap 回调 buffer 不刚好是 20ms,需要做 帧拼接/切片(ring buffer) - -3.3 接口定义(OC) -@protocol AudioCaptureManagerDelegate -- (void)audioCaptureManagerDidOutputPCMFrame:(NSData *)pcmFrame; // 20ms/640B -- (void)audioCaptureManagerDidUpdateRMS:(float)rms; // 可选:UI波形 -@end - -@interface AudioCaptureManager : NSObject -@property (nonatomic, weak) id delegate; -- (BOOL)startCapture:(NSError **)error; -- (void)stopCapture; -@end - -4. ASR 流式识别(iOS15:NSURLSessionWebSocketTask) -4.1 建议协议(控制帧 JSON + 音频帧二进制) - -Start(文本帧) - -{ - "type":"start", - "sessionId":"uuid", - "format":"pcm_s16le", - "sampleRate":16000, - "channels":1 -} - - -Audio(二进制帧) - -直接发送 640B/帧 PCM - -频率:50fps(每秒 50 帧) - -Finalize(文本帧) - -{ "type":"finalize", "sessionId":"uuid" } - -4.2 下行事件 -{ "type":"partial", "text":"今天" } -{ "type":"final", "text":"今天天气怎么样" } -{ "type":"error", "code":123, "message":"..." } - -4.3 ASRStreamClient 接口(OC) -@protocol ASRStreamClientDelegate -- (void)asrClientDidReceivePartialText:(NSString *)text; -- (void)asrClientDidReceiveFinalText:(NSString *)text; -- (void)asrClientDidFail:(NSError *)error; -@end - -@interface ASRStreamClient : NSObject -@property (nonatomic, weak) id delegate; -- (void)startWithSessionId:(NSString *)sessionId; -- (void)sendAudioPCMFrame:(NSData *)pcmFrame; // 20ms frame -- (void)finalize; -- (void)cancel; -@end - -5. 
LLM 流式生成(token stream) -5.1 目标 - -低延迟:不要等整段回答 - -使用 SSE 或 WS 收 token - -token 进入 Segmenter,够一句就触发 TTS - -5.2 LLMStreamClient 接口(OC) -@protocol LLMStreamClientDelegate -- (void)llmClientDidReceiveToken:(NSString *)token; -- (void)llmClientDidComplete; -- (void)llmClientDidFail:(NSError *)error; -@end - -@interface LLMStreamClient : NSObject -@property (nonatomic, weak) id delegate; -- (void)sendUserText:(NSString *)text conversationId:(NSString *)cid; -- (void)cancel; -@end - -6. Segmenter(句子切分:先播第一句) -6.1 切分规则(推荐) - -任一满足则切分成 segment: - -遇到 。!?\n 之一 - -或累积字符数 ≥ 30(可配置) - -6.2 Segmenter 接口(OC) -@interface Segmenter : NSObject -- (void)appendToken:(NSString *)token; -- (NSArray *)popReadySegments; // 返回立即可TTS的片段数组 -- (void)reset; -@end - -7. TTS:返回形态未定 → 客户端做“可插拔播放管线” - -由于服务端同事未定输出格式,客户端必须支持以下 四种 TTS 输出模式 的任意一种: - -模式 A:返回 m4a/MP3 URL(最容易落地) - -服务端返回 URL(或 base64 文件) - -客户端用 AVPlayer / AVAudioPlayer 播放 - -字幕同步用“音频时长映射”(可拿到 duration) - -优点:服务端简单 -缺点:首帧延迟通常更高(要等整段生成、至少等首包) - -模式 B:返回 AAC chunk(流式) - -服务端 WS 推 AAC 帧 - -客户端需要 AAC 解码成 PCM,再喂 AudioStreamPlayer - -模式 C:返回 Opus chunk(流式) - -需 Opus 解码库(服务端/客户端成本更高) - -解码后喂 PCM 播放 - -模式 D:返回 PCM chunk(最适合低延迟) - -服务端直接推 PCM16 chunk(比如 100ms 一块) - -客户端直接转 AVAudioPCMBuffer schedule - -延迟最低、实现最稳 - -8. 
TTSServiceClient(统一网络层接口) -8.1 统一回调事件(抽象) -typedef NS_ENUM(NSInteger, TTSPayloadType) { - TTSPayloadTypeURL, // A - TTSPayloadTypePCMChunk, // D - TTSPayloadTypeAACChunk, // B - TTSPayloadTypeOpusChunk // C -}; - -@protocol TTSServiceClientDelegate -- (void)ttsClientDidReceiveURL:(NSURL *)url segmentId:(NSString *)segmentId; -- (void)ttsClientDidReceiveAudioChunk:(NSData *)chunk - payloadType:(TTSPayloadType)type - segmentId:(NSString *)segmentId; -- (void)ttsClientDidFinishSegment:(NSString *)segmentId; -- (void)ttsClientDidFail:(NSError *)error; -@end - -@interface TTSServiceClient : NSObject -@property (nonatomic, weak) id delegate; -- (void)requestTTSForText:(NSString *)text segmentId:(NSString *)segmentId; -- (void)cancel; -@end - - -这样服务端最后选哪种输出,你只需实现对应分支即可,不需要推翻客户端架构。 - -9. TTSPlaybackPipeline(播放管线:根据 payloadType 路由) -9.1 设计目标 - -支持 URL 播放与流式 chunk 播放 - -提供统一的“开始播放/停止/进度”接口供字幕同步与打断使用 - -9.2 Pipeline 结构(建议) - -TTSPlaybackPipeline 只做路由与队列管理 - -URL → TTSURLPlayer(AVPlayer) - -PCM → AudioStreamPlayer(AVAudioEngine) - -AAC/Opus → Decoder → PCM → AudioStreamPlayer - -9.3 Pipeline 接口(OC) -@protocol TTSPlaybackPipelineDelegate -- (void)pipelineDidStartSegment:(NSString *)segmentId duration:(NSTimeInterval)duration; -- (void)pipelineDidUpdatePlaybackTime:(NSTimeInterval)time segmentId:(NSString *)segmentId; -- (void)pipelineDidFinishSegment:(NSString *)segmentId; -@end - -@interface TTSPlaybackPipeline : NSObject -@property (nonatomic, weak) id delegate; - -- (BOOL)start:(NSError **)error; // 启动音频引擎等 -- (void)stop; // 立即停止(打断) - -- (void)enqueueURL:(NSURL *)url segmentId:(NSString *)segmentId; -- (void)enqueueChunk:(NSData *)chunk payloadType:(TTSPayloadType)type segmentId:(NSString *)segmentId; - -// 可选:用于字幕同步 -- (NSTimeInterval)currentTimeForSegment:(NSString *)segmentId; -- (NSTimeInterval)durationForSegment:(NSString *)segmentId; -@end - -10. 
AudioStreamPlayer(PCM 流式播放,低延迟核心) -10.1 使用 AVAudioEngine + AVAudioPlayerNode - -将 PCM chunk 转 AVAudioPCMBuffer - -scheduleBuffer 播放 - -维护“当前 segment 的播放时间/总时长”(可估算或累加 chunk 时长) - -10.2 接口(OC) -@interface AudioStreamPlayer : NSObject -- (BOOL)start:(NSError **)error; -- (void)stop; -- (void)enqueuePCMChunk:(NSData *)pcmData - sampleRate:(double)sampleRate - channels:(int)channels - segmentId:(NSString *)segmentId; - -- (NSTimeInterval)playbackTimeForSegment:(NSString *)segmentId; -- (NSTimeInterval)durationForSegment:(NSString *)segmentId; -@end - - -PCM chunk 的粒度建议:50ms~200ms(太小 schedule 太频繁,太大延迟高)。 - -11. 字幕同步(延迟优先) -11.1 策略 - -对每个 segment 的文本 text,按播放进度映射显示字符数: - -visibleCount = round(text.length * (t / T)) - -t:segment 当前播放进度(pipeline 提供) - -T:segment 总时长(URL 模式直接取;chunk 模式可累加估算) - -11.2 SubtitleSync 接口(OC) -@interface SubtitleSync : NSObject -- (NSString *)visibleTextForFullText:(NSString *)fullText - currentTime:(NSTimeInterval)t - duration:(NSTimeInterval)T; -@end - -12. ConversationOrchestrator(状态机 + 打断 + 队列) -12.1 状态 -typedef NS_ENUM(NSInteger, ConversationState) { - ConversationStateIdle, - ConversationStateListening, - ConversationStateRecognizing, - ConversationStateThinking, - ConversationStateSpeaking -}; - -12.2 关键流程 -事件:用户按住(userDidPressRecord) - -如果正在 Speaking/Thinking: - -[ttsService cancel] - -[llmClient cancel] - -[asrClient cancel](如仍在识别) - -[pipeline stop](立即停播) - -清空 segment 队列、字幕队列 - -配置/激活 AudioSession - -新建 sessionId - -[asrClient startWithSessionId:] - -[audioCapture startCapture:] - -state = Listening - -事件:用户松开(userDidReleaseRecord) - -[audioCapture stopCapture] - -[asrClient finalize] - -state = Recognizing - -回调:ASR final text - -UI 显示用户最终文本 - -state = Thinking - -开始 LLM stream:[llmClient sendUserText:conversationId:] - -回调:LLM token - -segmenter appendToken - -segments = [segmenter popReadySegments] - -对每个 segment: - -生成 segmentId - -记录 segmentTextMap[segmentId] = segmentText - -[ttsService requestTTSForText:segmentId:] - 
-当收到第一个可播放音频并开始播: - -state = Speaking - -回调:TTS 音频到达 - -URL:[pipeline enqueueURL:segmentId:] - -chunk:[pipeline enqueueChunk:payloadType:segmentId:] - -回调:pipeline 播放时间更新(每 30-60fps 或定时器) - -根据当前 segmentId 取到 fullText - -visible = [subtitleSync visibleTextForFullText:currentTime:duration:] - -UI 更新 AI 可见文本 - -12.3 打断(Barge-in) - -当用户再次按住: - -立即 stop 播放 - -取消所有未完成网络请求 - -丢弃所有未播放 segments - -开始新一轮录音 - -12.4 Orchestrator 接口(OC) -@interface ConversationOrchestrator : NSObject -@property (nonatomic, assign, readonly) ConversationState state; - -- (void)userDidPressRecord; -- (void)userDidReleaseRecord; - -@property (nonatomic, copy) void (^onUserFinalText)(NSString *text); -@property (nonatomic, copy) void (^onAssistantVisibleText)(NSString *text); -@property (nonatomic, copy) void (^onError)(NSError *error); -@end - -13. 线程/队列模型(强制要求,避免竞态) - -建议三条队列 + 一条 orchestrator 串行队列: - -dispatch_queue_t audioQueue;(采集帧处理、ring buffer) - -dispatch_queue_t networkQueue;(WS 收发解析) - -dispatch_queue_t orchestratorQueue;(状态机串行,唯一修改 state/队列的地方) - -UI 更新统一回主线程 - -规则: - -任何网络/音频回调 → dispatch_async(orchestratorQueue, ^{ ... }) - -Orchestrator 内部再决定是否发 UI 回调(主线程) - -14. 关键参数(延迟与稳定性) - -音频帧:20ms - -PCM:16k/mono/int16 - -ASR 上传:WS 二进制 - -LLM:token stream - -TTS:优先 chunk;若 URL 模式也要尽快开始下载与播放 - -chunk 播放缓冲:100~200ms(防抖动) - -15. 开发落地建议(服务端未定情况下的迭代路径) -Phase 1:先跑通端到端(用“URL 模式”模拟) - -TTSServiceClient 先假定服务端返回 m4a URL(或本地 mock URL) - -Pipeline 实现 URL 播放(AVPlayer) - -打断 + 字幕同步先跑通 - -Phase 2:服务端定了输出后再替换 - -若服务端给 PCM chunk:直接走 AudioStreamPlayer(最推荐) - -若给 AAC chunk:补 AAC 解码模块(AudioConverter 或第三方) - -若给 Opus chunk:集成 Opus 解码库,再喂 PCM - -关键:Orchestrator/Segmenter/ASR/字幕同步都不需要改,只替换 TTSPlaybackPipeline 分支。 - -16. 
合规/体验注意 - -录音必须由用户动作触发(按住) - -明确的“正在录音”提示与波形 - -避免自动偷录 - -播放时允许随时打断 - -文档结束 -给“写代码的 AI”的额外要求(建议你一并附上) - -语言:Objective-C(.h/.m) - -iOS 15+,WebSocket 用 NSURLSessionWebSocketTask - -音频采集用 AVAudioEngine + ring buffer 切 20ms 帧 - -播放管线必须支持:URL 播放(AVPlayer)+ PCM chunk 播放(AVAudioEngine) - -其余 AAC/Opus 分支可留 TODO / stub,但接口要预留 diff --git a/keyBoard/Class/AiTalk/deepgramAPI.md b/keyBoard/Class/AiTalk/deepgramAPI.md deleted file mode 100644 index 208be33..0000000 --- a/keyBoard/Class/AiTalk/deepgramAPI.md +++ /dev/null @@ -1,1119 +0,0 @@ -# Pre-Recorded Audio - -POST https://api.deepgram.com/v1/listen -Content-Type: application/json - -Transcribe audio and video using Deepgram's speech-to-text REST API - -Reference: https://developers.deepgram.com/reference/speech-to-text/listen-pre-recorded - -## OpenAPI Specification - -```yaml -openapi: 3.1.1 -info: - title: Transcribe and analyze pre-recorded audio and video - version: endpoint_listen/v1/media.transcribe -paths: - /v1/listen: - post: - operationId: transcribe - summary: Transcribe and analyze pre-recorded audio and video - description: Transcribe audio and video using Deepgram's speech-to-text REST API - tags: - - - subpackage_listen - - subpackage_listen/v1 - - subpackage_listen/v1/media - parameters: - - name: callback - in: query - description: URL to which we'll make the callback request - required: false - schema: - type: string - - name: callback_method - in: query - description: HTTP method by which the callback request will be made - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersCallbackMethod' - - name: extra - in: query - description: >- - Arbitrary key-value pairs that are attached to the API response for - usage in downstream processing - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersExtra' - - name: sentiment - in: query - description: Recognizes the sentiment throughout a transcript or text - required: false - schema: - type: boolean - default: 
false - - name: summarize - in: query - description: >- - Summarize content. For Listen API, supports string version option. - For Read API, accepts boolean only. - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersSummarize' - - name: tag - in: query - description: >- - Label your requests for the purpose of identification during usage - reporting - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersTag' - - name: topics - in: query - description: Detect topics throughout a transcript or text - required: false - schema: - type: boolean - default: false - - name: custom_topic - in: query - description: >- - Custom topics you want the model to detect within your input audio - or text if present Submit up to `100`. - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersCustomTopic' - - name: custom_topic_mode - in: query - description: >- - Sets how the model will interpret strings submitted to the - `custom_topic` param. When `strict`, the model will only return - topics submitted using the `custom_topic` param. When `extended`, - the model will return its own detected topics in addition to those - submitted using the `custom_topic` param - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersCustomTopicMode' - - name: intents - in: query - description: Recognizes speaker intent throughout a transcript or text - required: false - schema: - type: boolean - default: false - - name: custom_intent - in: query - description: >- - Custom intents you want the model to detect within your input audio - if present - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersCustomIntent' - - name: custom_intent_mode - in: query - description: >- - Sets how the model will interpret intents submitted to the - `custom_intent` param. When `strict`, the model will only return - intents submitted using the `custom_intent` param. 
When `extended`, - the model will return its own detected intents in addition to - those submitted using the `custom_intent` param. - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersCustomIntentMode' - - name: detect_entities - in: query - description: Identifies and extracts key entities from content in submitted audio - required: false - schema: - type: boolean - default: false - - name: detect_language - in: query - description: Identifies the dominant language spoken in submitted audio - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersDetectLanguage' - - name: diarize - in: query - description: >- - Recognize speaker changes. Each word in the transcript will be - assigned a speaker number starting at 0 - required: false - schema: - type: boolean - default: false - - name: dictation - in: query - description: Dictation mode for controlling formatting with dictated speech - required: false - schema: - type: boolean - default: false - - name: encoding - in: query - description: Specify the expected encoding of your submitted audio - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersEncoding' - - name: filler_words - in: query - description: >- - Filler Words can help transcribe interruptions in your audio, like - "uh" and "um" - required: false - schema: - type: boolean - default: false - - name: keyterm - in: query - description: >- - Key term prompting can boost or suppress specialized terminology and - brands. Only compatible with Nova-3 - required: false - schema: - type: array - items: - type: string - - name: keywords - in: query - description: Keywords can boost or suppress specialized terminology and brands - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersKeywords' - - name: language - in: query - description: >- - The [BCP-47 language tag](https://tools.ietf.org/html/bcp47) that - hints at the primary spoken language. 
Depending on the Model and API - endpoint you choose only certain languages are available - required: false - schema: - type: string - default: en - - name: measurements - in: query - description: >- - Spoken measurements will be converted to their corresponding - abbreviations - required: false - schema: - type: boolean - default: false - - name: model - in: query - description: AI model used to process submitted audio - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersModel' - - name: multichannel - in: query - description: Transcribe each audio channel independently - required: false - schema: - type: boolean - default: false - - name: numerals - in: query - description: Numerals converts numbers from written format to numerical format - required: false - schema: - type: boolean - default: false - - name: paragraphs - in: query - description: Splits audio into paragraphs to improve transcript readability - required: false - schema: - type: boolean - default: false - - name: profanity_filter - in: query - description: >- - Profanity Filter looks for recognized profanity and converts it to - the nearest recognized non-profane word or removes it from the - transcript completely - required: false - schema: - type: boolean - default: false - - name: punctuate - in: query - description: Add punctuation and capitalization to the transcript - required: false - schema: - type: boolean - default: false - - name: redact - in: query - description: Redaction removes sensitive information from your transcripts - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersRedact' - - name: replace - in: query - description: Search for terms or phrases in submitted audio and replaces them - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersReplace' - - name: search - in: query - description: Search for terms or phrases in submitted audio - required: false - schema: - $ref: 
'#/components/schemas/V1ListenPostParametersSearch' - - name: smart_format - in: query - description: >- - Apply formatting to transcript output. When set to true, additional - formatting will be applied to transcripts to improve readability - required: false - schema: - type: boolean - default: false - - name: utterances - in: query - description: Segments speech into meaningful semantic units - required: false - schema: - type: boolean - default: false - - name: utt_split - in: query - description: >- - Seconds to wait before detecting a pause between words in submitted - audio - required: false - schema: - type: number - format: double - default: 0.8 - - name: version - in: query - description: Version of an AI model to use - required: false - schema: - $ref: '#/components/schemas/V1ListenPostParametersVersion' - - name: mip_opt_out - in: query - description: >- - Opts out requests from the Deepgram Model Improvement Program. Refer - to our Docs for pricing impacts before setting this to true. - https://dpgr.am/deepgram-mip - required: false - schema: - type: boolean - default: false - - name: Authorization - in: header - description: Header authentication of the form `Token <YOUR_DEEPGRAM_API_KEY>` - required: true - schema: - type: string - responses: - '200': - description: >- - Returns either transcription results, or a request_id when using a - callback. 
- content: - application/json: - schema: - $ref: '#/components/schemas/listen_v1_media_transcribe_Response_200' - '400': - description: Invalid Request - content: {} - requestBody: - description: Transcribe an audio or video file - content: - application/json: - schema: - $ref: '#/components/schemas/ListenV1RequestUrl' -components: - schemas: - V1ListenPostParametersCallbackMethod: - type: string - enum: - - value: POST - - value: PUT - default: POST - V1ListenPostParametersExtra: - oneOf: - - type: string - - type: array - items: - type: string - V1ListenPostParametersSummarize0: - type: string - enum: - - value: v2 - V1ListenPostParametersSummarize: - oneOf: - - $ref: '#/components/schemas/V1ListenPostParametersSummarize0' - - type: boolean - default: false - V1ListenPostParametersTag: - oneOf: - - type: string - - type: array - items: - type: string - V1ListenPostParametersCustomTopic: - oneOf: - - type: string - - type: array - items: - type: string - V1ListenPostParametersCustomTopicMode: - type: string - enum: - - value: extended - - value: strict - default: extended - V1ListenPostParametersCustomIntent: - oneOf: - - type: string - - type: array - items: - type: string - V1ListenPostParametersCustomIntentMode: - type: string - enum: - - value: extended - - value: strict - default: extended - V1ListenPostParametersDetectLanguage: - oneOf: - - type: boolean - default: false - - type: array - items: - type: string - V1ListenPostParametersEncoding: - type: string - enum: - - value: linear16 - - value: flac - - value: mulaw - - value: amr-nb - - value: amr-wb - - value: opus - - value: speex - - value: g729 - V1ListenPostParametersKeywords: - oneOf: - - type: string - - type: array - items: - type: string - V1ListenPostParametersModel0: - type: string - enum: - - value: nova-3 - - value: nova-3-general - - value: nova-3-medical - - value: nova-2 - - value: nova-2-general - - value: nova-2-meeting - - value: nova-2-finance - - value: nova-2-conversationalai - - 
value: nova-2-voicemail - - value: nova-2-video - - value: nova-2-medical - - value: nova-2-drivethru - - value: nova-2-automotive - - value: nova - - value: nova-general - - value: nova-phonecall - - value: nova-medical - - value: enhanced - - value: enhanced-general - - value: enhanced-meeting - - value: enhanced-phonecall - - value: enhanced-finance - - value: base - - value: meeting - - value: phonecall - - value: finance - - value: conversationalai - - value: voicemail - - value: video - V1ListenPostParametersModel: - oneOf: - - $ref: '#/components/schemas/V1ListenPostParametersModel0' - - type: string - V1ListenPostParametersRedactSchemaOneOf1Items: - type: string - enum: - - value: pci - - value: pii - - value: numbers - V1ListenPostParametersRedact1: - type: array - items: - $ref: '#/components/schemas/V1ListenPostParametersRedactSchemaOneOf1Items' - V1ListenPostParametersRedact: - oneOf: - - type: string - - $ref: '#/components/schemas/V1ListenPostParametersRedact1' - V1ListenPostParametersReplace: - oneOf: - - type: string - - type: array - items: - type: string - V1ListenPostParametersSearch: - oneOf: - - type: string - - type: array - items: - type: string - V1ListenPostParametersVersion0: - type: string - enum: - - value: latest - V1ListenPostParametersVersion: - oneOf: - - $ref: '#/components/schemas/V1ListenPostParametersVersion0' - - type: string - ListenV1RequestUrl: - type: object - properties: - url: - type: string - format: uri - required: - - url - ListenV1ResponseMetadataModelInfo: - type: object - properties: {} - ListenV1ResponseMetadataSummaryInfo: - type: object - properties: - model_uuid: - type: string - input_tokens: - type: number - format: double - output_tokens: - type: number - format: double - ListenV1ResponseMetadataSentimentInfo: - type: object - properties: - model_uuid: - type: string - input_tokens: - type: number - format: double - output_tokens: - type: number - format: double - ListenV1ResponseMetadataTopicsInfo: - type: 
object - properties: - model_uuid: - type: string - input_tokens: - type: number - format: double - output_tokens: - type: number - format: double - ListenV1ResponseMetadataIntentsInfo: - type: object - properties: - model_uuid: - type: string - input_tokens: - type: number - format: double - output_tokens: - type: number - format: double - ListenV1ResponseMetadata: - type: object - properties: - transaction_key: - type: string - default: deprecated - request_id: - type: string - format: uuid - sha256: - type: string - created: - type: string - format: date-time - duration: - type: number - format: double - channels: - type: number - format: double - models: - type: array - items: - type: string - model_info: - $ref: '#/components/schemas/ListenV1ResponseMetadataModelInfo' - summary_info: - $ref: '#/components/schemas/ListenV1ResponseMetadataSummaryInfo' - sentiment_info: - $ref: '#/components/schemas/ListenV1ResponseMetadataSentimentInfo' - topics_info: - $ref: '#/components/schemas/ListenV1ResponseMetadataTopicsInfo' - intents_info: - $ref: '#/components/schemas/ListenV1ResponseMetadataIntentsInfo' - tags: - type: array - items: - type: string - required: - - request_id - - sha256 - - created - - duration - - channels - - models - - model_info - ListenV1ResponseResultsChannelsItemsSearchItemsHitsItems: - type: object - properties: - confidence: - type: number - format: double - start: - type: number - format: double - end: - type: number - format: double - snippet: - type: string - ListenV1ResponseResultsChannelsItemsSearchItems: - type: object - properties: - query: - type: string - hits: - type: array - items: - $ref: >- - #/components/schemas/ListenV1ResponseResultsChannelsItemsSearchItemsHitsItems - ListenV1ResponseResultsChannelsItemsAlternativesItemsWordsItems: - type: object - properties: - word: - type: string - start: - type: number - format: double - end: - type: number - format: double - confidence: - type: number - format: double - 
ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphsParagraphsItemsSentencesItems: - type: object - properties: - text: - type: string - start: - type: number - format: double - end: - type: number - format: double - ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphsParagraphsItems: - type: object - properties: - sentences: - type: array - items: - $ref: >- - #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphsParagraphsItemsSentencesItems - speaker: - type: number - format: double - num_words: - type: number - format: double - start: - type: number - format: double - end: - type: number - format: double - ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphs: - type: object - properties: - transcript: - type: string - paragraphs: - type: array - items: - $ref: >- - #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphsParagraphsItems - ListenV1ResponseResultsChannelsItemsAlternativesItemsEntitiesItems: - type: object - properties: - label: - type: string - value: - type: string - raw_value: - type: string - confidence: - type: number - format: double - start_word: - type: number - format: double - end_word: - type: number - format: double - ListenV1ResponseResultsChannelsItemsAlternativesItemsSummariesItems: - type: object - properties: - summary: - type: string - start_word: - type: number - format: double - end_word: - type: number - format: double - ListenV1ResponseResultsChannelsItemsAlternativesItemsTopicsItems: - type: object - properties: - text: - type: string - start_word: - type: number - format: double - end_word: - type: number - format: double - topics: - type: array - items: - type: string - ListenV1ResponseResultsChannelsItemsAlternativesItems: - type: object - properties: - transcript: - type: string - confidence: - type: number - format: double - words: - type: array - items: - $ref: >- - 
#/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsWordsItems - paragraphs: - $ref: >- - #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsParagraphs - entities: - type: array - items: - $ref: >- - #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsEntitiesItems - summaries: - type: array - items: - $ref: >- - #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsSummariesItems - topics: - type: array - items: - $ref: >- - #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItemsTopicsItems - ListenV1ResponseResultsChannelsItems: - type: object - properties: - search: - type: array - items: - $ref: >- - #/components/schemas/ListenV1ResponseResultsChannelsItemsSearchItems - alternatives: - type: array - items: - $ref: >- - #/components/schemas/ListenV1ResponseResultsChannelsItemsAlternativesItems - detected_language: - type: string - ListenV1ResponseResultsChannels: - type: array - items: - $ref: '#/components/schemas/ListenV1ResponseResultsChannelsItems' - ListenV1ResponseResultsUtterancesItemsWordsItems: - type: object - properties: - word: - type: string - start: - type: number - format: double - end: - type: number - format: double - confidence: - type: number - format: double - speaker: - type: number - format: double - speaker_confidence: - type: number - format: double - punctuated_word: - type: string - ListenV1ResponseResultsUtterancesItems: - type: object - properties: - start: - type: number - format: double - end: - type: number - format: double - confidence: - type: number - format: double - channel: - type: number - format: double - transcript: - type: string - words: - type: array - items: - $ref: >- - #/components/schemas/ListenV1ResponseResultsUtterancesItemsWordsItems - speaker: - type: number - format: double - id: - type: string - format: uuid - ListenV1ResponseResultsUtterances: - type: array - items: - $ref: 
'#/components/schemas/ListenV1ResponseResultsUtterancesItems' - ListenV1ResponseResultsSummary: - type: object - properties: - result: - type: string - short: - type: string - SharedTopicsResultsTopicsSegmentsItemsTopicsItems: - type: object - properties: - topic: - type: string - confidence_score: - type: number - format: double - SharedTopicsResultsTopicsSegmentsItems: - type: object - properties: - text: - type: string - start_word: - type: number - format: double - end_word: - type: number - format: double - topics: - type: array - items: - $ref: >- - #/components/schemas/SharedTopicsResultsTopicsSegmentsItemsTopicsItems - SharedTopicsResultsTopics: - type: object - properties: - segments: - type: array - items: - $ref: '#/components/schemas/SharedTopicsResultsTopicsSegmentsItems' - SharedTopicsResults: - type: object - properties: - topics: - $ref: '#/components/schemas/SharedTopicsResultsTopics' - SharedTopics: - type: object - properties: - results: - $ref: '#/components/schemas/SharedTopicsResults' - SharedIntentsResultsIntentsSegmentsItemsIntentsItems: - type: object - properties: - intent: - type: string - confidence_score: - type: number - format: double - SharedIntentsResultsIntentsSegmentsItems: - type: object - properties: - text: - type: string - start_word: - type: number - format: double - end_word: - type: number - format: double - intents: - type: array - items: - $ref: >- - #/components/schemas/SharedIntentsResultsIntentsSegmentsItemsIntentsItems - SharedIntentsResultsIntents: - type: object - properties: - segments: - type: array - items: - $ref: '#/components/schemas/SharedIntentsResultsIntentsSegmentsItems' - SharedIntentsResults: - type: object - properties: - intents: - $ref: '#/components/schemas/SharedIntentsResultsIntents' - SharedIntents: - type: object - properties: - results: - $ref: '#/components/schemas/SharedIntentsResults' - SharedSentimentsSegmentsItems: - type: object - properties: - text: - type: string - start_word: - type: 
number - format: double - end_word: - type: number - format: double - sentiment: - type: string - sentiment_score: - type: number - format: double - SharedSentimentsAverage: - type: object - properties: - sentiment: - type: string - sentiment_score: - type: number - format: double - SharedSentiments: - type: object - properties: - segments: - type: array - items: - $ref: '#/components/schemas/SharedSentimentsSegmentsItems' - average: - $ref: '#/components/schemas/SharedSentimentsAverage' - ListenV1ResponseResults: - type: object - properties: - channels: - $ref: '#/components/schemas/ListenV1ResponseResultsChannels' - utterances: - $ref: '#/components/schemas/ListenV1ResponseResultsUtterances' - summary: - $ref: '#/components/schemas/ListenV1ResponseResultsSummary' - topics: - $ref: '#/components/schemas/SharedTopics' - intents: - $ref: '#/components/schemas/SharedIntents' - sentiments: - $ref: '#/components/schemas/SharedSentiments' - required: - - channels - ListenV1Response: - type: object - properties: - metadata: - $ref: '#/components/schemas/ListenV1ResponseMetadata' - results: - $ref: '#/components/schemas/ListenV1ResponseResults' - required: - - metadata - - results - ListenV1AcceptedResponse: - type: object - properties: - request_id: - type: string - format: uuid - description: Unique identifier for tracking the asynchronous request - required: - - request_id - listen_v1_media_transcribe_Response_200: - oneOf: - - $ref: '#/components/schemas/ListenV1Response' - - $ref: '#/components/schemas/ListenV1AcceptedResponse' - -``` - -## SDK Code Examples - -```python -import requests - -url = "https://api.deepgram.com/v1/listen" - -payload = { "url": "https://dpgr.am/spacewalk.wav" } -headers = { - "Authorization": "", - "Content-Type": "application/json" -} - -response = requests.post(url, json=payload, headers=headers) - -print(response.json()) -``` - -```javascript -const url = 'https://api.deepgram.com/v1/listen'; -const options = { - method: 'POST', - 
headers: {Authorization: '', 'Content-Type': 'application/json'}, - body: '{"url":"https://dpgr.am/spacewalk.wav"}' -}; - -try { - const response = await fetch(url, options); - const data = await response.json(); - console.log(data); -} catch (error) { - console.error(error); -} -``` - -```go -package main - -import ( - "fmt" - "strings" - "net/http" - "io" -) - -func main() { - - url := "https://api.deepgram.com/v1/listen" - - payload := strings.NewReader("{\n \"url\": \"https://dpgr.am/spacewalk.wav\"\n}") - - req, _ := http.NewRequest("POST", url, payload) - - req.Header.Add("Authorization", "") - req.Header.Add("Content-Type", "application/json") - - res, _ := http.DefaultClient.Do(req) - - defer res.Body.Close() - body, _ := io.ReadAll(res.Body) - - fmt.Println(res) - fmt.Println(string(body)) - -} -``` - -```ruby -require 'uri' -require 'net/http' - -url = URI("https://api.deepgram.com/v1/listen") - -http = Net::HTTP.new(url.host, url.port) -http.use_ssl = true - -request = Net::HTTP::Post.new(url) -request["Authorization"] = '' -request["Content-Type"] = 'application/json' -request.body = "{\n \"url\": \"https://dpgr.am/spacewalk.wav\"\n}" - -response = http.request(request) -puts response.read_body -``` - -```java -HttpResponse response = Unirest.post("https://api.deepgram.com/v1/listen") - .header("Authorization", "") - .header("Content-Type", "application/json") - .body("{\n \"url\": \"https://dpgr.am/spacewalk.wav\"\n}") - .asString(); -``` - -```php -request('POST', 'https://api.deepgram.com/v1/listen', [ - 'body' => '{ - "url": "https://dpgr.am/spacewalk.wav" -}', - 'headers' => [ - 'Authorization' => '', - 'Content-Type' => 'application/json', - ], -]); - -echo $response->getBody(); -``` - -```csharp -var client = new RestClient("https://api.deepgram.com/v1/listen"); -var request = new RestRequest(Method.POST); -request.AddHeader("Authorization", ""); -request.AddHeader("Content-Type", "application/json"); -request.AddParameter("application/json", 
"{\n \"url\": \"https://dpgr.am/spacewalk.wav\"\n}", ParameterType.RequestBody); -IRestResponse response = client.Execute(request); -``` - -```swift -import Foundation - -let headers = [ - "Authorization": "", - "Content-Type": "application/json" -] -let parameters = ["url": "https://dpgr.am/spacewalk.wav"] as [String : Any] - -let postData = JSONSerialization.data(withJSONObject: parameters, options: []) - -let request = NSMutableURLRequest(url: NSURL(string: "https://api.deepgram.com/v1/listen")! as URL, - cachePolicy: .useProtocolCachePolicy, - timeoutInterval: 10.0) -request.httpMethod = "POST" -request.allHTTPHeaderFields = headers -request.httpBody = postData as Data - -let session = URLSession.shared -let dataTask = session.dataTask(with: request as URLRequest, completionHandler: { (data, response, error) -> Void in - if (error != nil) { - print(error as Any) - } else { - let httpResponse = response as? HTTPURLResponse - print(httpResponse) - } -}) - -dataTask.resume() -``` diff --git a/keyBoard/Class/AiTalk/websocket-api.md b/keyBoard/Class/AiTalk/websocket-api.md deleted file mode 100644 index 1373f4e..0000000 --- a/keyBoard/Class/AiTalk/websocket-api.md +++ /dev/null @@ -1,771 +0,0 @@ -# 实时语音对话 WebSocket API 文档 - -> Version: 2.0.0 (Flux) -> Last Updated: 2026-01-21 -> Author: Backend Team - ---- - -## 概述 - -本文档描述实时语音对话 WebSocket API,用于 iOS 客户端与后端进行实时语音交互。 - -**v2.0 更新**: 升级为 Deepgram Flux 模型,支持智能轮次检测和 EagerEndOfTurn 提前响应。 - -### 核心特性 -- **智能轮次检测**: Flux 模型语义理解,自动判断用户说完(非简单静默检测) -- **EagerEndOfTurn**: 提前启动 LLM 响应,进一步降低延迟 -- **实时语音识别**: 边说边识别,实时显示转写文本 -- **流式响应**: AI 响应边生成边返回,无需等待完整响应 -- **流式音频**: TTS 音频边合成边播放,极低延迟 -- **Barge-in 支持**: 用户可以打断 AI 说话 - -### 性能指标 -| 指标 | 目标值 | 说明 | -|------|--------|------| -| 端点检测延迟 | ~260ms | Flux 智能检测 | -| TTFA (首音频延迟) | < 300ms | EagerEndOfTurn 优化 | -| 端到端延迟 | < 1.5秒 | 完整对话周期 | -| 实时转写延迟 | < 100ms | 中间结果 | - ---- - -## 连接信息 - -### WebSocket 端点 - -``` -生产环境: 
wss://api.yourdomain.com/api/ws/chat?token={sa_token} -开发环境: ws://localhost:7529/api/ws/chat?token={sa_token} -``` - -### 认证方式 - -通过 URL Query 参数传递 Sa-Token: - -``` -ws://host:port/api/ws/chat?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9... -``` - -| 参数 | 类型 | 必填 | 描述 | -|------|------|------|------| -| token | String | ✅ | Sa-Token 登录令牌,通过 Apple Sign-In 获取 | - -### 认证失败 - -如果 token 无效或过期,WebSocket 连接将被拒绝(HTTP 403)。 - ---- - -## 消息格式 - -### 通用规则 - -1. **文本消息**: JSON 格式,用于控制指令和状态通知 -2. **二进制消息**: 原始字节,用于音频数据传输 -3. **编码**: UTF-8 - ---- - -## 客户端 → 服务端消息 - -### 1. 开始会话 (session_start) - -**发送时机**: 建立 WebSocket 连接后,准备开始录音前 - -```json -{ - "type": "session_start", - "config": { - "language": "en", - "voice_id": "a5zfmqTslZJBP0jutmVY" - } -} -``` - -| 字段 | 类型 | 必填 | 描述 | -|------|------|------|------| -| type | String | ✅ | 固定值 `session_start` | -| config | Object | ❌ | 会话配置(可选) | -| config.language | String | ❌ | 语音识别语言,默认 `en` | -| config.voice_id | String | ❌ | TTS 声音 ID,默认使用服务端配置 | - -**响应**: 服务端返回 `session_started` 消息 - ---- - -### 2. 音频数据 (Binary) - -**发送时机**: 用户正在录音时,持续发送音频数据 - -**格式**: Binary WebSocket Frame,直接发送原始音频字节 - -**音频规格要求**: - -| 参数 | 值 | 说明 | -|------|------|------| -| 编码格式 | PCM (Linear16) | 未压缩的脉冲编码调制 | -| 采样率 | 16000 Hz | 16kHz | -| 位深度 | 16-bit | 有符号整数 | -| 声道数 | 1 (Mono) | 单声道 | -| 字节序 | Little-Endian | 小端序 | - -**iOS 代码示例**: - -```swift -// AVAudioEngine 配置 -let format = AVAudioFormat( - commonFormat: .pcmFormatInt16, - sampleRate: 16000, - channels: 1, - interleaved: true -)! - -// 发送音频数据 -audioEngine.inputNode.installTap( - onBus: 0, - bufferSize: 1024, - format: format -) { buffer, time in - let audioData = buffer.int16ChannelData![0] - let byteCount = Int(buffer.frameLength) * 2 // 16-bit = 2 bytes - let data = Data(bytes: audioData, count: byteCount) - - webSocket.write(data: data) -} -``` - -**发送频率**: 建议每 20-100ms 发送一次,每次 320-1600 字节 - ---- - -### 3. 
结束录音 (audio_end) - -**发送时机**: 用户停止录音(松开录音按钮) - -```json -{ - "type": "audio_end" -} -``` - -| 字段 | 类型 | 必填 | 描述 | -|------|------|------|------| -| type | String | ✅ | 固定值 `audio_end` | - -**说明**: 发送此消息后,服务端将完成语音识别并开始生成 AI 响应 - ---- - -### 4. 取消会话 (cancel) - -**发送时机**: 用户主动取消对话(如点击取消按钮) - -```json -{ - "type": "cancel" -} -``` - -| 字段 | 类型 | 必填 | 描述 | -|------|------|------|------| -| type | String | ✅ | 固定值 `cancel` | - -**说明**: 服务端将停止所有处理,不再返回任何消息 - ---- - -## 服务端 → 客户端消息 - -### 1. 会话已启动 (session_started) - -**接收时机**: 发送 `session_start` 后 - -```json -{ - "type": "session_started", - "session_id": "abc123-def456-ghi789" -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `session_started` | -| session_id | String | 服务端分配的会话 ID | - -**客户端处理**: 收到此消息后,可以开始发送音频数据 - ---- - -### 2. 轮次开始 (turn_start) 🆕 - -**接收时机**: 用户开始说话时(Flux 检测到语音活动) - -```json -{ - "type": "turn_start", - "turn_index": 0 -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `turn_start` | -| turn_index | Integer | 当前轮次索引(从 0 开始) | - -**客户端处理**: -- 可显示"正在听..."状态 -- 准备接收转写结果 - ---- - -### 3. 中间转写结果 (transcript_interim) - -**接收时机**: 用户说话过程中,实时返回 - -```json -{ - "type": "transcript_interim", - "text": "Hello how are", - "is_final": false -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `transcript_interim` | -| text | String | 当前识别到的文本(可能会变化) | -| is_final | Boolean | 固定为 `false` | - -**客户端处理**: -- 实时更新 UI 显示转写文本 -- 此文本可能会被后续消息覆盖 -- 可用于显示"正在识别..."效果 - ---- - -### 3. 最终转写结果 (transcript_final) - -**接收时机**: 一句话识别完成时 - -```json -{ - "type": "transcript_final", - "text": "Hello, how are you?" -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `transcript_final` | -| text | String | 最终确定的转写文本 | - -**客户端处理**: -- 用此文本替换之前的中间结果 -- 此文本不会再变化 - ---- - -### 6. 
提前端点检测 (eager_eot) 🆕 - -**接收时机**: Flux 检测到用户可能说完时(置信度达到阈值) - -```json -{ - "type": "eager_eot", - "transcript": "Hello, how are you", - "confidence": 0.65 -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `eager_eot` | -| transcript | String | 当前转写文本 | -| confidence | Double | 端点置信度 (0.0-1.0) | - -**客户端处理**: -- 这是一个**预测性事件**,表示用户可能说完了 -- 服务端已开始提前准备 LLM 响应 -- 可显示"准备响应..."状态 -- **注意**: 用户可能继续说话,此时会收到 `turn_resumed` - ---- - -### 7. 轮次恢复 (turn_resumed) 🆕 - -**接收时机**: 收到 `eager_eot` 后,用户继续说话 - -```json -{ - "type": "turn_resumed" -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `turn_resumed` | - -**客户端处理**: -- 用户继续说话,之前的 `eager_eot` 是误判 -- 服务端已取消正在准备的草稿响应 -- 恢复"正在听..."状态 -- 继续接收 `transcript_interim` 更新 - ---- - -### 8. LLM 开始生成 (llm_start) - -**接收时机**: 语音识别完成,AI 开始生成响应 - -```json -{ - "type": "llm_start" -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `llm_start` | - -**客户端处理**: -- 可显示"AI 正在思考..."状态 -- 准备接收 AI 响应文本和音频 - ---- - -### 5. LLM Token (llm_token) - -**接收时机**: AI 生成过程中,逐 token 返回 - -```json -{ - "type": "llm_token", - "token": "Hi" -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `llm_token` | -| token | String | AI 输出的单个 token(词或字符片段) | - -**客户端处理**: -- 可选择实现打字机效果 -- 逐个 token 追加显示 AI 响应文本 -- 如不需要打字效果,可忽略此消息 - ---- - -### 6. 音频数据 (Binary) - -**接收时机**: TTS 合成过程中,流式返回音频 - -**格式**: Binary WebSocket Frame,MP3 音频块 - -**音频规格**: - -| 参数 | 值 | -|------|------| -| 格式 | MP3 | -| 采样率 | 44100 Hz | -| 比特率 | 64 kbps | -| 声道 | 单声道 | - -**客户端处理**: - -```swift -// 使用 AVAudioEngine 或 AudioQueue 播放流式音频 -webSocket.onEvent = { event in - switch event { - case .binary(let data): - // 方案1: 追加到缓冲区,使用 AVAudioPlayerNode - audioBuffer.append(data) - playBufferedAudio() - - // 方案2: 使用 AVAudioEngine + AVAudioCompressedBuffer - // 方案3: 累积后使用 AVAudioPlayer - } -} -``` - -**重要提示**: -- 音频是分块返回的,需要正确拼接或流式播放 -- 每个二进制消息是 MP3 数据的一部分 -- 收到 `complete` 消息后,音频传输完成 - ---- - -### 7. 
处理完成 (complete) - -**接收时机**: AI 响应生成完成,所有音频已发送 - -```json -{ - "type": "complete", - "transcript": "Hello, how are you?", - "ai_response": "Hi! I'm doing great, thanks for asking!" -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `complete` | -| transcript | String | 完整的用户语音转写文本 | -| ai_response | String | 完整的 AI 响应文本 | - -**客户端处理**: -- 更新 UI 显示完整对话 -- 可开始下一轮对话 -- 建议保存对话历史 - ---- - -### 8. 错误 (error) - -**接收时机**: 处理过程中发生错误 - -```json -{ - "type": "error", - "code": "DEEPGRAM_ERROR", - "message": "Speech recognition failed" -} -``` - -| 字段 | 类型 | 描述 | -|------|------|------| -| type | String | 固定值 `error` | -| code | String | 错误代码 | -| message | String | 错误描述 | - -**错误代码列表**: - -| 错误代码 | 描述 | 建议处理 | -|----------|------|----------| -| PARSE_ERROR | 消息解析失败 | 检查消息格式 | -| DEEPGRAM_ERROR | 语音识别服务错误 | 重试或提示用户 | -| DEEPGRAM_INIT_ERROR | 语音识别初始化失败 | 重新开始会话 | -| LLM_ERROR | AI 生成错误 | 重试或提示用户 | -| PIPELINE_ERROR | 处理流程错误 | 重新开始会话 | -| EMPTY_TRANSCRIPT | 未检测到语音 | 提示用户重新说话 | - -**客户端处理**: -- 显示友好的错误提示 -- 根据错误类型决定是否重试 - ---- - -## 完整交互流程 - -### 时序图 - -``` -iOS Client Server - | | - |------ WebSocket Connect --------->| - | ?token=xxx | - | | - |<-------- Connected ---------------| - | | - |------ session_start ------------->| - | | - |<----- session_started ------------| - | {session_id: "abc"} | - | | - |======= 用户开始说话 ===============| - | | - |------ Binary (audio) ------------>| - |------ Binary (audio) ------------>| - |<----- transcript_interim ---------| - | {text: "Hello"} | - |------ Binary (audio) ------------>| - |<----- transcript_interim ---------| - | {text: "Hello how"} | - |------ Binary (audio) ------------>| - |<----- transcript_final -----------| - | {text: "Hello, how are you?"}| - | | - |======= 用户停止说话 ===============| - | | - |------ audio_end ----------------->| - | | - |<----- llm_start ------------------| - | | - |<----- llm_token ------------------| - | {token: "Hi"} | - |<----- llm_token ------------------| - | {token: "!"} | - 
|<----- Binary (mp3) ---------------| - |<----- Binary (mp3) ---------------| - |<----- llm_token ------------------| - | {token: " I'm"} | - |<----- Binary (mp3) ---------------| - | ... | - |<----- complete -------------------| - | {transcript, ai_response} | - | | - |======= 可以开始下一轮 =============| - | | -``` - ---- - -## iOS 代码示例 - -### 完整 Swift 实现 - -```swift -import Foundation -import Starscream // WebSocket 库 - -class VoiceChatManager: WebSocketDelegate { - - private var socket: WebSocket? - private var audioBuffer = Data() - - // MARK: - 回调 - var onSessionStarted: ((String) -> Void)? - var onTranscriptInterim: ((String) -> Void)? - var onTranscriptFinal: ((String) -> Void)? - var onLLMStart: (() -> Void)? - var onLLMToken: ((String) -> Void)? - var onAudioChunk: ((Data) -> Void)? - var onComplete: ((String, String) -> Void)? - var onError: ((String, String) -> Void)? - - // MARK: - 连接 - func connect(token: String) { - let urlString = "wss://api.yourdomain.com/api/ws/chat?token=\(token)" - guard let url = URL(string: urlString) else { return } - - var request = URLRequest(url: url) - request.timeoutInterval = 30 - - socket = WebSocket(request: request) - socket?.delegate = self - socket?.connect() - } - - func disconnect() { - socket?.disconnect() - socket = nil - } - - // MARK: - 发送消息 - func startSession(language: String = "en", voiceId: String? = nil) { - var config: [String: Any] = ["language": language] - if let voiceId = voiceId { - config["voice_id"] = voiceId - } - - let message: [String: Any] = [ - "type": "session_start", - "config": config - ] - - sendJSON(message) - } - - func sendAudio(_ data: Data) { - socket?.write(data: data) - } - - func endAudio() { - sendJSON(["type": "audio_end"]) - } - - func cancel() { - sendJSON(["type": "cancel"]) - } - - private func sendJSON(_ dict: [String: Any]) { - guard let data = try? 
JSONSerialization.data(withJSONObject: dict), - let string = String(data: data, encoding: .utf8) else { return } - socket?.write(string: string) - } - - // MARK: - WebSocketDelegate - func didReceive(event: WebSocketEvent, client: WebSocketClient) { - switch event { - case .connected(_): - print("WebSocket connected") - - case .disconnected(let reason, let code): - print("WebSocket disconnected: \(reason) (\(code))") - - case .text(let text): - handleTextMessage(text) - - case .binary(let data): - // 收到 MP3 音频数据 - onAudioChunk?(data) - - case .error(let error): - print("WebSocket error: \(error?.localizedDescription ?? "unknown")") - - default: - break - } - } - - private func handleTextMessage(_ text: String) { - guard let data = text.data(using: .utf8), - let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any], - let type = json["type"] as? String else { return } - - switch type { - case "session_started": - if let sessionId = json["session_id"] as? String { - onSessionStarted?(sessionId) - } - - case "transcript_interim": - if let text = json["text"] as? String { - onTranscriptInterim?(text) - } - - case "transcript_final": - if let text = json["text"] as? String { - onTranscriptFinal?(text) - } - - case "llm_start": - onLLMStart?() - - case "llm_token": - if let token = json["token"] as? String { - onLLMToken?(token) - } - - case "complete": - if let transcript = json["transcript"] as? String, - let aiResponse = json["ai_response"] as? String { - onComplete?(transcript, aiResponse) - } - - case "error": - if let code = json["code"] as? String, - let message = json["message"] as? 
String { - onError?(code, message) - } - - default: - print("Unknown message type: \(type)") - } - } -} -``` - -### 使用示例 - -```swift -class VoiceChatViewController: UIViewController { - - let chatManager = VoiceChatManager() - let audioRecorder = AudioRecorder() // 自定义录音类 - let audioPlayer = StreamingAudioPlayer() // 自定义流式播放类 - - override func viewDidLoad() { - super.viewDidLoad() - setupCallbacks() - } - - func setupCallbacks() { - chatManager.onSessionStarted = { [weak self] sessionId in - print("Session started: \(sessionId)") - // 开始录音 - self?.audioRecorder.start { audioData in - self?.chatManager.sendAudio(audioData) - } - } - - chatManager.onTranscriptInterim = { [weak self] text in - self?.transcriptLabel.text = text + "..." - } - - chatManager.onTranscriptFinal = { [weak self] text in - self?.transcriptLabel.text = text - } - - chatManager.onLLMStart = { [weak self] in - self?.statusLabel.text = "AI is thinking..." - } - - chatManager.onLLMToken = { [weak self] token in - self?.aiResponseLabel.text = (self?.aiResponseLabel.text ?? "") + token - } - - chatManager.onAudioChunk = { [weak self] data in - self?.audioPlayer.appendData(data) - } - - chatManager.onComplete = { [weak self] transcript, aiResponse in - self?.statusLabel.text = "Complete" - self?.addToHistory(user: transcript, ai: aiResponse) - } - - chatManager.onError = { [weak self] code, message in - self?.showError(message) - } - } - - @IBAction func startTapped(_ sender: UIButton) { - // 连接并开始会话 - chatManager.connect(token: AuthManager.shared.saToken) - chatManager.onSessionStarted = { [weak self] _ in - self?.chatManager.startSession() - } - } - - @IBAction func stopTapped(_ sender: UIButton) { - audioRecorder.stop() - chatManager.endAudio() - } - - @IBAction func cancelTapped(_ sender: UIButton) { - audioRecorder.stop() - audioPlayer.stop() - chatManager.cancel() - } -} -``` - ---- - -## 注意事项 - -### 1. 
音频录制 -- 必须使用 PCM 16-bit, 16kHz, Mono 格式 -- 建议每 20-100ms 发送一次音频数据 -- 录音权限需要在 Info.plist 中声明 - -### 2. 音频播放 -- 返回的是 MP3 格式音频块 -- 需要实现流式播放或缓冲播放 -- 建议使用 AVAudioEngine 实现低延迟播放 - -### 3. 网络处理 -- 实现自动重连机制 -- 处理网络切换场景 -- 设置合理的超时时间 - -### 4. 用户体验 -- 显示实时转写文本 -- 显示 AI 响应状态 -- 提供取消按钮 -- 处理录音权限被拒绝的情况 - -### 5. 调试建议 -- 使用 `wss://` 确保生产环境安全 -- 本地开发可使用 `ws://` -- 检查 Sa-Token 是否过期 - ---- - -## 版本历史 - -| 版本 | 日期 | 变更 | -|------|------|------| -| 1.0.0 | 2026-01-21 | 初始版本 | -