问题描述
我确实匹配文本中的单词,以检索单词offset的开始和结束。当使用适当的支持Unicode的正则表达式(如 '(?<=^|\\PL)$1(?=\\PL|$)'
)时,这通常适用于ASCII文本和Unicode文本。当我混合使用文字(例如此处的韩语和英语)时,在标记时会出现一些问题:
function aggressive_tokenizer(text) {
// most punctuation
text = text.replace(/([^\w\.\'\-\/\+\<\>,&])/g," $1 ");
// commas if followed by space
text = text.replace(/(,\s)/g," $1");
// single quotes if followed by a space
text = text.replace(/('\s)/g," $1");
// single quotes if last char
text = text.replace(/('$)/," $1");
text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g," $2")
// periods before newline or end of string
text = text.replace(/\. *(\n|$)/g," . ");
// replace punct
// ignore "-" since may be in slang scream
text = text.replace(/[\\?\^%<>=!&|+\~]/g,"");
text = text.replace(/[…;,.:*#\)\({}\[\]]/g,"");
// finally split remainings into words
text = text.split(/\s+/)
return text;
}
var text = "점점 더 깊이 끌려가"
var tokens = aggressive_tokenizer(text);
var seen = new Map();
var indexes = tokens.map(token => { // for each token
let item = {
"word": token
}
var pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g,"\\$&");
var wordRegex = new XRegExp(pattern.replace('$1',escaped),"g");
// calculate token begin end
var match = null;
while ((match = wordRegex.exec(text)) !== null) {
if (match.index > (seen.get(token) || -1)) {
var wordStart = match.index;
var wordEnd = wordStart + token.length - 1;
item.characterOffsetBegin = wordStart;
item.characterOffsetEnd = wordEnd;
seen.set(token,wordEnd);
break;
}
}
return item;
});
indexes.forEach(index => {
if (index.word != text.slice(index.characterOffsetBegin,index.characterOffsetEnd + 1)) {
console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin,index.characterOffsetEnd + 1))
} else {
console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin,index.characterOffsetEnd + 1))
}
});
<script src="https://unpkg.com/xregexp/xregexp-all.js"></script>
问题是我在令牌化方面做了一些清理工作
text = text.replace(/([^\w\.\'\-\/\+\<\>," $1 ");
其中\w
不支持Unicode,但是如果我将其替换为\p{Alnum}
:
text = text.replace(/([^\p{Alnum}\.\'\-\/\+\<\>," $1 ");
它应该与Unicode字等效,无法正常工作。
注意 请注意,我确实使用XRegExp在JavaScript中支持Unicode正则表达式。
更新
根据下面的评论,由于缺少对宽度可变的后向模式的支持(请参见评论),我使用了经过修改的模式regexp '(?<=^|\\PL)$1(?=\\PL|$)'
更新了– Wiktor Stribiżew,并用内置的RegExp替换了XRegExp。 。
此解决方案效果更好,但是我发现了另外一种情况,即对于给定的输入文本,char偏移的开始和结尾无法匹配:"점점 더 깊이 끌려가"
输出将缺少/匹配的偏移量
{
"index": 2,"word": "점"
}
function aggressive_tokenizer(text) {
// most punctuation
text = text.replace(/[^\w\.\-\/\+\<\>,&]/g," $& ");
// commas if followed by space
text = text.replace(/(,"\\$&");
var wordRegex = new RegExp(pattern.replace('$1',wordEnd);
break;
}
}
return item;
});
indexes.forEach(index => {
if (!index.characterOffsetBegin && !index.characterOffsetEnd) {
console.log("MISSING INDEXES " + index.word);
} else if (index.word != text.slice(index.characterOffsetBegin,index.characterOffsetEnd + 1))
}
});
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)