如何在 C# 中按运算符对源代码进行分词(tokenize)

问题描述

我正在读取 TextBox 中的所有行,并尝试删除列表中的所有空格。

我需要能够对下面这段代码进行分词(拆分成一个个标记):

if(x==0)
{
    cout<<x;
} 

使其变成:

if
(
x
==
0
)
{
cout
<<
x
;
} 

我的代码

/// <summary>
/// Splits C-like source text into tokens: identifiers/literals, operators and punctuation.
/// Whitespace is used as a separator and is never returned as a token.
/// Each token is also written to the console, one per line.
/// </summary>
/// <param name="sourceCode">The source text to tokenize.</param>
/// <returns>The tokens in source order, without empty or whitespace-only entries.</returns>
public static string[] Tokenize(string sourceCode)
{
    // Two-character operators are listed before the single-character class so that
    // "==", "<<", "&&" etc. are captured as one delimiter instead of two.
    // The capturing group makes Regex.Split return the delimiters as well.
    Regex RE = new Regex(@"(==|!=|<=|>=|<<|>>|&&|\|\||\+\+|--|[+\-*%,;&|<>=!{}()]|\s+)");

    // Split yields empty strings between adjacent delimiters and returns the
    // whitespace runs themselves; neither is a token, so both are dropped here.
    var list = new List<string>();
    foreach (string piece in RE.Split(sourceCode))
    {
        if (!string.IsNullOrWhiteSpace(piece))
        {
            list.Add(piece);
        }
    }

    foreach (string token in list)
    {
        Console.WriteLine(token);
    }

    // Return the filtered tokens rather than re-splitting the input.
    return list.ToArray();
}

我的输出

if(x
=

=
0)






{








 

 

 
cout
<

<
x
;







}

如何使用 == << && 之类的符号进行拆分以及如何从列表中删除空格? 有没有更好的方法来实现我想要的?

解决方法

我同意 @juharr 的评论。但是,如果您确实想使用正则表达式,最好使用 Matches 方法而不是 Split,因为它让您直接描述要查找的标记(token)本身,而不是标记之间的边界:

 // Describe the tokens themselves rather than the separators between them.
 // Multi-character operators are listed before the single-character class so that
 // "==", "!=", "<=", "<<" or "&&" each come out as exactly one token; characters that
 // match no alternative (such as whitespace) are simply skipped by Matches.
 Regex RE = new Regex(@"\w+|==|!=|<=|>=|<<|>>|&&|\|\||\+\+|--|[()+\-*%,;&|<>=!{}]");
 foreach (Match m in RE.Matches(sourceCode))
 {
  Console.WriteLine(m.Value);
 }

结果:

if
(
x
==
0
)
{
cout
<<
x
;
}

此外,你也可以这样做:

// A single capturing group is repeated over the input; each repetition is recorded
// as one entry in Groups[1].Captures, in the order it was matched.
const string tokenPattern = @"([\p{L}_][\p{L}\p{N}_]*|[+-]?[0-9]+|==|!=|>=|<=|<<|>>|\|\||&&|[!=+\-*/%{}();]|\s+)*";
CaptureCollection pieces = Regex.Match(str, tokenPattern).Groups[1].Captures;
for (int i = 0; i < pieces.Count; i++)
{
    // Whitespace runs are one of the alternatives, so they are printed as well.
    Console.WriteLine(pieces[i].Value);
}

(与您的示例相比,我已经包含了许多其他运算符)。这仍然是一个坏主意。

现在......这仍然是一个坏主意,但你可以让它变得更复杂:

// Sample input: a small C-like program including char and string literals with escapes.
string str = @"if(x==0)
{
    cout<<x;
    var x1 = '\a';
    var x2 = '\'';
    var x3 = 'X';
    var x4 = ""He\""llo\n"";
}";

// Token definitions. Each entry becomes one named group in the combined regex.
// Escape = false: Pattern is already a regular expression.
// Escape = true:  Pattern is literal text and is passed through Regex.Escape.
var fragments = new[]
{
    // The order of these patterns is important! Longer patterns should go first (so += before + for example)
    new { Name = "Keyword", Pattern = @"(?:if|for|while|var|int|long|string|char|return)\b", Escape = false },
    new { Name = "Symbol", Pattern = @"[\p{L}_][\p{L}\p{N}_]*\b", Escape = false },
    new { Name = "Number", Pattern = @"[+-]?[0-9]+(?:\.[0-9]+)?\b", Escape = false },
    new { Name = "OperatorAssign", Pattern = @"<<=|>>=|&&=|\|\|=|[+\-*/%&|^]=", Escape = false },
    // "<<" and ">>" must be tried before "<" and ">", otherwise a shift is lexed as two comparisons.
    new { Name = "Operator", Pattern = @"==|!=|>=|<=|<<|>>|>|<|&&|\|\||[+\-*/%&|^!]", Escape = false },
    new { Name = "Space", Pattern = @"\s+", Escape = false },
    new { Name = "Assign", Pattern = @"=", Escape = true },
    new { Name = "OpenBrace", Pattern = @"{", Escape = true },
    new { Name = "CloseBrace", Pattern = @"}", Escape = true },
    new { Name = "Semicolon", Pattern = @";", Escape = true },
    new { Name = "OpenRoundParenthesis", Pattern = @"(", Escape = true },
    new { Name = "CloseRoundParenthesis", Pattern = @")", Escape = true },
    new { Name = "OpenSquareParenthesis", Pattern = @"[", Escape = true },
    new { Name = "CloseSquareParenthesis", Pattern = @"]", Escape = true },
    new { Name = "Char", Pattern = @"'(?:\\.|.)'", Escape = false },
    new { Name = "String", Pattern = @"\""(?:\\.|[^""])*""", Escape = false },
};

// Build one alternation of named groups: (?<Keyword>...)|(?<Symbol>...)|...
string allPatterns = string.Join("|", fragments.Select(x => $"(?<{x.Name}>{(x.Escape ? Regex.Escape(x.Pattern) : x.Pattern)})"));

// \G anchors every match at the position the search starts from, so no input can be skipped silently.
var rx = new Regex(@"\G(?:" + allPatterns + ")");

int ix = 0;

while (ix < str.Length)
{
    var match = rx.Match(str, ix);

    if (!match.Success)
    {
        // No token definition matches at this position: report the remaining input and stop.
        Console.WriteLine($"Error starting at: {str.Substring(ix)}");
        break;
    }

    // Skip group 0 (the whole match); exactly one named group succeeds and identifies the token kind.
    var group = match.Groups.OfType<Group>().Skip(1).Single(x => x.Success);

    string name = group.Name;
    string value = match.Value;

    if (name != "Space")
    {
        Console.WriteLine($"Match: {name}: {value}");
    }
    else
    {
        Console.WriteLine("Skipping some space");
    }

    // Continue right after the token that was just consumed.
    ix += value.Length;
}