149 lines
3.9 KiB
Mathematica
149 lines
3.9 KiB
Mathematica
|
|
//
|
|||
|
|
// Segmenter.m
|
|||
|
|
// keyBoard
|
|||
|
|
//
|
|||
|
|
// Created by Mac on 2026/1/15.
|
|||
|
|
//
|
|||
|
|
|
|||
|
|
#import "Segmenter.h"
|
|||
|
|
|
|||
|
|
/// Private storage for Segmenter; not exposed through the public header.
@interface Segmenter ()

/// Segments that have already been cut and are waiting to be handed out by
/// -popReadySegments.
@property (nonatomic, strong) NSMutableArray<NSString *> *readySegments;

/// Text appended via -appendToken: that has not yet been cut into a segment.
@property (nonatomic, strong) NSMutableString *buffer;

@end
|
|||
|
|
|
|||
|
|
@implementation Segmenter

/// Creates a segmenter with an empty buffer, no pending segments and a
/// default forced-split threshold of 30 UTF-16 code units.
- (instancetype)init {
    self = [super init];
    if (self) {
        _buffer = [[NSMutableString alloc] init];
        _readySegments = [[NSMutableArray alloc] init];
        _maxCharacterThreshold = 30;
    }
    return self;
}

#pragma mark - Public Methods

/// Appends a piece of incoming text to the internal buffer and cuts off every
/// segment that is now complete.
/// @param token Text to append. nil or empty input is ignored.
- (void)appendToken:(NSString *)token {
    // Messaging nil yields 0, so this covers both nil and @"".
    if (token.length == 0) {
        return;
    }

    [self.buffer appendString:token];

    // See whether the new text completed one or more segments.
    [self checkAndSplit];
}

/// Returns all segments that are ready and removes them from the internal
/// queue.
/// @return An immutable snapshot of the ready segments (possibly empty), in
///         the order they were produced.
- (NSArray<NSString *> *)popReadySegments {
    NSArray<NSString *> *segments = [self.readySegments copy];
    [self.readySegments removeAllObjects];
    return segments;
}

/// Empties the buffer and returns whatever text was still waiting in it.
/// @return The buffered text with leading/trailing whitespace and newlines
///         removed, or nil when nothing but whitespace was left.
- (NSString *)flushRemainingSegment {
    NSString *remaining = [self.buffer copy];
    [self.buffer setString:@""];

    // Strip leading and trailing whitespace.
    remaining = [remaining
        stringByTrimmingCharactersInSet:[NSCharacterSet
                                            whitespaceAndNewlineCharacterSet]];

    return remaining.length > 0 ? remaining : nil;
}

/// Discards the buffered text and all segments that were ready but not yet
/// popped.
- (void)reset {
    [self.buffer setString:@""];
    [self.readySegments removeAllObjects];
}

#pragma mark - Private Methods

/// Repeatedly cuts segments off the front of the buffer until no split
/// condition holds any more.
///
/// A segment ends at the first sentence-ending character. If there is none and
/// the buffer has reached maxCharacterThreshold, the buffer is split at the
/// best soft break point, or, failing that, at the threshold itself.
- (void)checkAndSplit {
    // Sentence-ending punctuation, in both ASCII and full-width CJK forms.
    NSCharacterSet *sentenceEnders =
        [NSCharacterSet characterSetWithCharactersInString:@"。!?！？\n"];

    // A non-positive threshold disables length-based splitting. Without this
    // guard a threshold of 0 would cut zero characters per pass and never
    // leave the loop.
    NSInteger threshold = (NSInteger)self.maxCharacterThreshold;

    while (self.buffer.length > 0) {
        // Look for the first sentence-ending character.
        NSRange enderRange =
            [self.buffer rangeOfCharacterFromSet:sentenceEnders];
        if (enderRange.location != NSNotFound) {
            // Cut right after the punctuation mark.
            [self consumePrefixOfLength:NSMaxRange(enderRange)];
            continue;
        }

        if (threshold <= 0 || self.buffer.length < (NSUInteger)threshold) {
            // No split condition is met; wait for more text.
            break;
        }

        // No sentence end but the buffer is too long: force a split,
        // preferably at a comma, space or similar soft break.
        NSRange breakRange = [self findBestBreakPoint:self.buffer];
        if (breakRange.location != NSNotFound) {
            [self consumePrefixOfLength:NSMaxRange(breakRange)];
        } else {
            // No suitable break point: cut at the threshold, adjusted so that
            // a surrogate pair / composed character is never torn apart.
            NSUInteger cut =
                [self composedCharacterSafeCutForLimit:(NSUInteger)threshold];
            [self consumePrefixOfLength:cut];
        }
    }
}

/// Removes the first `length` UTF-16 units from the buffer and queues them as
/// a ready segment. The segment is trimmed of surrounding whitespace and
/// newlines and dropped if nothing remains.
/// @param length Number of units to consume; must not exceed the buffer length.
- (void)consumePrefixOfLength:(NSUInteger)length {
    NSString *segment = [[self.buffer substringToIndex:length]
        stringByTrimmingCharactersInSet:[NSCharacterSet
                                            whitespaceAndNewlineCharacterSet]];
    if (segment.length > 0) {
        [self.readySegments addObject:segment];
    }

    // Drop the consumed part from the buffer.
    [self.buffer deleteCharactersInRange:NSMakeRange(0, length)];
}

/// Returns a cut length close to `limit` that does not fall inside a composed
/// character sequence of the buffer.
/// @param limit Desired cut length in UTF-16 units; must be at least 1.
/// @return `limit` when it already lies on a sequence boundary (or at/after
///         the end of the buffer, in which case the buffer length is
///         returned); otherwise the start of the straddling sequence, or its
///         end when the sequence begins at index 0, so the result is never 0.
- (NSUInteger)composedCharacterSafeCutForLimit:(NSUInteger)limit {
    NSUInteger length = self.buffer.length;
    if (limit >= length) {
        return length;
    }

    NSRange sequence =
        [self.buffer rangeOfComposedCharacterSequenceAtIndex:limit];
    if (sequence.location == limit) {
        return limit;
    }
    return sequence.location > 0 ? sequence.location : NSMaxRange(sequence);
}

/// Finds the last soft break character (comma, enumeration comma, semicolon,
/// colon or space) in `text`.
///
/// The search runs backwards from the end so the segment keeps as much text as
/// possible, and stops at index maxCharacterThreshold / 2.
/// @param text The text to search.
/// @return The one-unit range of the break character, or {NSNotFound, 0} when
///         there is none in the searched part.
- (NSRange)findBestBreakPoint:(NSString *)text {
    // Soft break characters, in both ASCII and full-width CJK forms.
    NSCharacterSet *breakChars =
        [NSCharacterSet characterSetWithCharactersInString:@",，、;；:： "];

    // Signed arithmetic with a floor of 0: the bound stays valid whatever the
    // declared integer type of maxCharacterThreshold is.
    NSInteger lowerBound = MAX((NSInteger)self.maxCharacterThreshold / 2, 0);
    NSUInteger start = (NSUInteger)lowerBound;
    if (start >= text.length) {
        return NSMakeRange(NSNotFound, 0);
    }

    return [text rangeOfCharacterFromSet:breakChars
                                 options:NSBackwardsSearch
                                   range:NSMakeRange(start,
                                                     text.length - start)];
}

@end
|