使用 parsec.py

问题描述

我想解析这样的字符串：

Object A -> Object B [AB_name] Object B -> Object C [BC_name] ...

我的目标是获得三个列表：

parents = ['Object A', 'Object B', ...]

children = ['Object B', 'Object C', ...]

PC_names = ['AB_name','BC_name',...]

我已经有了一个可行的解决方案，但它很难读懂，因此难以维护，也不够健壮。基本上，我的代码用两层循环遍历字符串，并把截取到的子字符串分别添加到几个列表中。

为了解决这个问题，我查阅了 parsec.py 库的资料，但到目前为止还没有找到适合我这种新手的好例子。我已经尝试借助其他文章和文档弄清楚它是如何工作的，但目前收效甚微。

任何提示我都非常感谢。

test_input.txt：

Society {
    A man -> a child [was once]
    A man -> an automobile [has]
    A woman -> a person [is]
    A man -> a person [is]
    A man -> a child [was once] 
}

我当前的代码：

from typing import List
from parsec import *

class Type:
    """Holder for a single ``label`` attribute.

    The attribute is only declared here; it does not exist on an instance
    until something assigns it.
    """

    label: str

class Aspect:
    """Relation with a ``domain`` Type, a ``codomain`` Type and a ``label``.

    A new instance gets two fresh, unlabelled Type objects; ``label`` is
    only declared and has to be assigned after construction.
    """

    domain: Type
    codomain: Type
    label: str

    def __init__(self) -> None:
        # Both ends start out as independent placeholder Type objects.
        self.domain, self.codomain = Type(), Type()
    
class Olog:
    """Container with a ``name`` and a list of Aspect objects.

    ``name`` is only declared here and must be assigned after construction;
    ``aspects`` starts out empty.
    """

    name: str
    aspects: List[Aspect]

    def __init__(self) -> None:
        # Every instance gets its own, initially empty, aspect list.
        self.aspects = list()

# Hand-rolled parser: reads the whole file, splits it on "{" and "]" and walks
# the whitespace-separated words of every chunk, appending one Aspect to
# `first.aspects` per completed "lhs -> rhs [label]" relation.
# NOTE(review): the sample input in the surrounding text is named
# test_input.txt, while this opens 'testinput.txt' — confirm the file name.
with open ('testinput.txt','r') as f:
        f_content = f.read()
        # Everything before the first "{" is used as the olog name (not stripped).
        olog_name = f_content.split("{")[0]
        first = Olog()
        # Redundant: Olog.__init__ already sets an empty list.
        first.aspects = []
        first.name = olog_name
        # Text following the first "{"; raises IndexError if the file has no "{".
        olog_data = f_content.split("{")[1]
        # Despite the name, these are chunks delimited by "]", not physical lines.
        olog_data_lines = olog_data.split(']')

        # Only ever reassigned to "->" below; never read in this block.
        orientation = str

        # counter1 counts "->" tokens seen, counter2 counts aspects appended.
        # While they are equal, the words being read belong to the left-hand side.
        counter1 = 0 
        counter2 = 0
        # Despite the names, these end up holding int indexes (see below).
        domain_str = ''
        codomain_str = ''
        # type_comma is not referenced again in this block.
        type_comma = Type()
        type_comma.label = ","
        # Comma-terminated history of every type name recorded so far.
        string_store = str
        string_store = ''
        # type_store.label mirrors string_store; type_store_split is its split(",").
        type_store = Type()
        type_store_split = [Type]


        for lines in olog_data_lines:
            # Per-chunk state: names of the left/right side and the label.
            first_type = ''
            second_type = ''
            aspect_label = str
            # Scratch objects: first_T is not used further, and second_T is
            # never attached to an aspect.
            first_T = Type()
            second_T = Type()
            # Restore the "]" consumed by split(']') so the closing-bracket
            # check below can fire on the last word of the chunk.
            lines += ']'
            lines_split = lines.split()
            type_in_list = False

            for word in lines_split:
                # A closing brace before any "->" has been seen.
                if word == "}" and counter1 == 0:
                        print("Olog is empty")
                # Closing brace: stop scanning this chunk (leaves the inner loop only).
                if word == "}":
                        print(">>>Olog was saved")
                        break

                if word == "->":

                    counter1 +=1
                # Left-hand side, first word of the chunk: start the domain name.
                # NOTE(review): index() returns the first occurrence, so a later
                # word equal to the first one is treated as position 0 again.
                if counter1 == counter2 and lines_split.index(word) == 0:

                    first_type = word
                # Left-hand side, later words: append space-separated.
                if counter1 == counter2 and not lines_split.index(word) == 0:
                    first_type = first_type + (" " + word)
                # Arrow reached: record the domain name in the history and
                # look up its position there.
                if  word == "->": 
                        orientation = "->"
                        string_store = string_store + first_type + ","

                        type_store.label = string_store
                        type_store_split = type_store.label.split(",")


                        for types in type_store_split: 

                            if types == first_type:

                                # Index of the first matching history entry.
                                domain_str = int(type_store_split.index(types))
                                type_in_list = True
                                break
                        if not type_in_list:

                            # string_store ends with ",", so the split has a
                            # trailing empty entry; len - 2 is the newest name.
                            domain_str = int(len(type_store_split)-2)

                # Right-hand side: every word from "->" up to the closing "]".
                if not counter1 == counter2:
                    if word[0] == "[":
                        # Label = text between the first "[" and the next "]" of the chunk.
                        aspect_label = (lines.split('[',1)[1].split(']')[0])
                    # Accumulate the codomain name, dropping the "->" picked up first.
                    # NOTE(review): later words of a multi-word label (e.g. "once]")
                    # do not start with "[" and are appended here as well.
                    else: second_type = second_type.replace('->','',1) + " " + word

                    # Closing bracket: the relation is complete.
                    if (word[len(word)-1]=="]"):
                        second_T.label = second_type
                        string_store = string_store + second_type + ","
                        type_store.label = string_store
                        type_store_split = type_store.label.split(",")

                        for types in type_store_split:               
                            if types == second_type:
                                # Index of the first matching history entry.
                                codomain_str = int(type_store_split.index(types))
                                second_T.label = codomain_str
                                break
                            # Entry equal to the last element reached without a
                            # match: fall back to len - 2.
                            elif types == type_store_split[len(type_store_split)-1]:
                                codomain_str = int(len(type_store_split)-2)
                                second_T.label = codomain_str

                        # Store the relation; the domain/codomain labels are the
                        # int history indexes computed above, not the names.
                        aspect_A = Aspect()
                        aspect_A.label = aspect_label
                        aspect_A.domain = Type()
                        aspect_A.codomain = Type()
                        aspect_A.domain.label = domain_str
                        aspect_A.codomain.label = codomain_str
                        first.aspects.append(aspect_A)
                        # Counters equal again: the next words start a new left-hand side.
                        counter2 += 1
                    
``

解决方法

此解决方案使用 re 模块和递归：先解析每一行输入，再遍历解析结果，并通过 yield 逐个产生父级（parents）、子级（children）和 pc_names：

import re,collections
def parse_line(l):
    """Split one ``lhs -> rhs [label]`` line into its tokenised sides.

    The line is cut at every ``->`` (surrounding whitespace is consumed);
    each side is stripped and then tokenised into bracketed labels
    (``[...]``, brackets kept) and runs of word/whitespace characters.

    Args:
        l: One line of input text.

    Returns:
        A list with one token list per side, e.g.
        ``[['A man'], ['a child ', '[was once]']]``.

    Note:
        A name that is followed by a bracketed label keeps the whitespace
        before the bracket (``'a child '``). This is unchanged from the
        original; ``get_vals`` in this file compares tokens verbatim, so
        stripping here would change its traversal.
    """
    # Raw strings: the previous non-raw literals relied on invalid escape
    # sequences (``\[``, ``\w``, ``\s`` ...), which newer Python versions
    # report as a SyntaxWarning. The patterns themselves are equivalent.
    return [
        re.findall(r'\[.*?\]|[\w\s]+', side.strip())
        for side in re.split(r'\s*->\s*', l)
    ]

# Parse every relation line of the input file. Lines containing "{" or "}"
# (the "Name {" / "}" wrapper) are skipped, and so are blank lines, which
# would otherwise parse to a single empty side and break the two-way
# unpacking done on `lines` later. The `with` block closes the file handle,
# which the bare open() call never did.
with open('test_input.txt') as _input_file:
    lines = [
        parse_line(raw)
        for raw in _input_file
        if raw.strip() and not re.search(r'[{}]', raw)
    ]
def get_vals(d, s=None):
    """Walk the parsed relations and yield ``(category, value)`` pairs.

    Reads the module-level ``lines`` list, whose items are
    ``[left_tokens, right_tokens]`` pairs.

    Args:
        d: Token list of the side being visited. ``d[0]`` is its name; when
            the list has more than one element, the last one is a bracketed
            label whose first and last characters are cut off.
        s: Names visited on the way here. It is only consulted when ``d`` has
            no outgoing relation, to pick the first left-hand side whose name
            is not in it. Defaults to an empty history.

    Yields:
        ``('pc_names', label)`` if ``d`` carries a label, then either
        ``('parents', name)`` followed by everything yielded for each of its
        right-hand sides, or ``('children', name)`` followed by everything
        yielded for the first not-yet-visited left-hand side (if any).
    """
    # A None default replaces the former mutable ``[]`` default.
    seen = [] if s is None else s
    name = d[0]

    if len(d) > 1:
        # Drop the surrounding "[" and "]".
        yield ('pc_names', d[-1][1:-1])

    # Never mutated, so it can be shared by all recursive calls below.
    visited = seen + [name]

    # Right-hand sides of every relation whose left-hand name equals `name`.
    targets = [right for left, right in lines if name == left[0]]
    if targets:
        yield ('parents', name)
        for target in targets:
            yield from get_vals(target, visited)
        return

    yield ('children', name)
    # Continue with the first left-hand side that has not been visited yet.
    pending = [left for left, _ in lines if left[0] not in seen]
    if pending:
        yield from get_vals(pending[0], visited)

# Collect every yielded value under its category; using sets drops duplicates.
result = collections.defaultdict(set)
for category, value in get_vals(lines[0][0]):
    bucket = result[category]
    bucket.add(value)

# Convert each set to a list for display (element order comes from the set).
printable = {}
for category, values in result.items():
    printable[category] = list(values)
print(printable)

输出：

{'parents': ['A woman','A man'],'pc_names': ['was once','is','has'],'children': ['a person ','an automobile ','a child ']}

第二个test_input.txt内容：

Object A -> Object B [AB_name] 
Object B -> Object C [BC_name]

结果：

{'parents': ['Object B','Object A'],'pc_names': ['AB_name','BC_name'],'children': ['Object B ','Object C ']}

monads parsec parsing parsing python string string