带双字节字符的单词的Javascript单词边界正则表达式，带“ \ w”

问题描述

我确实匹配文本中的单词，以检索单词offset的开始和结束。当使用适当的支持Unicode的正则表达式（如 '(?<=^|\\PL)$1(?=\\PL|$)'）时，这通常适用于ASCII文本和Unicode文本。当我混合使用文字（例如此处的韩语和英语）时，在标记时会出现一些问题：

function aggressive_tokenizer(text) {
  // most punctuation
  text = text.replace(/([^\w\.\'\-\/\+\<\>,&])/g," $1 ");
  // commas if followed by space
  text = text.replace(/(,\s)/g," $1");
  // single quotes if followed by a space
  text = text.replace(/('\s)/g," $1");
  // single quotes if last char
  text = text.replace(/('$)/," $1");
  text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g," $2")
  // periods before newline or end of string
  text = text.replace(/\. *(\n|$)/g," . ");
  // replace punct
  // ignore "-" since may be in slang scream
  text = text.replace(/[\\?\^%<>=!&|+\~]/g,"");
  text = text.replace(/[…;,.:*#\)\({}\[\]]/g,"");
  // finally split remainings into words
  text = text.split(/\s+/)
  return text;
}
var text = "점점 더 깊이 끌려가"
var tokens = aggressive_tokenizer(text);
var seen = new Map();
var indexes = tokens.map(token => { // for each token
  let item = {
    "word": token
  }
  var pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
  var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g,"\\$&");
  var wordRegex = new XRegExp(pattern.replace('$1',escaped),"g");

  // calculate token begin end 
  var match = null;
  while ((match = wordRegex.exec(text)) !== null) {
    if (match.index > (seen.get(token) || -1)) {
      var wordStart = match.index;
      var wordEnd = wordStart + token.length - 1;
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token,wordEnd);
      break;
    }
  }
  return item;
});
indexes.forEach(index => {
  if (index.word != text.slice(index.characterOffsetBegin,index.characterOffsetEnd + 1)) {
    console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin,index.characterOffsetEnd + 1))
  } else {
    console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin,index.characterOffsetEnd + 1))
  }
});

<script src="https://unpkg.com/xregexp/xregexp-all.js"></script>

问题是我在令牌化方面做了一些清理工作

text = text.replace(/([^\w\.\'\-\/\+\<\>," $1 ");

其中\w不支持Unicode，但是如果我将其替换为\p{Alnum}：

text = text.replace(/([^\p{Alnum}\.\'\-\/\+\<\>," $1 ");

它应该与Unicode字等效，无法正常工作。

注意请注意，我确实使用XRegExp在JavaScript中支持Unicode正则表达式。

更新根据下面的评论，由于缺少对宽度可变的后向模式的支持（请参见评论），我使用了经过修改的模式regexp '(?<=^|\\PL)$1(?=\\PL|$)'更新了– Wiktor Stribiżew，并用内置的RegExp替换了XRegExp。。此解决方案效果更好，但是我发现了另外一种情况，即对于给定的输入文本，char偏移的开始和结尾无法匹配："점점 더 깊이 끌려가"输出将缺少/匹配的偏移量

{
    "index": 2,"word": "점"
}

function aggressive_tokenizer(text) {
  // most punctuation
  text = text.replace(/[^\w\.\-\/\+\<\>,&]/g," $& ");
  // commas if followed by space
  text = text.replace(/(,"\\$&");
  var wordRegex = new RegExp(pattern.replace('$1',wordEnd);
      break;
    }
  }
  return item;
});
indexes.forEach(index => {
  if (!index.characterOffsetBegin && !index.characterOffsetEnd) {
    console.log("MISSING INDEXES " + index.word);
  } else if (index.word != text.slice(index.characterOffsetBegin,index.characterOffsetEnd + 1))
  }
});

解决方法

暂无找到可以解决该程序问题的有效方法，小编努力寻找整理中！

如果你已经找到好的解决方法，欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@）

javascript regex regex regex tokenize unicode xregexp