问题描述
我想解析这样的字符串:
Object A -> Object B [AB_name] Object B -> Object C [BC_name] ...
我的目标是获得三个列表:
parents = ['Object A','Object B',...]
children = ['Object B','Object C',...]
PC_names = ['AB_name','BC_name',...]
我已经有了一个可行的解决方案,但它难以理解,因此难以维护,也不够健壮。基本上,我的代码用两层循环遍历字符串,并把子字符串依次添加到几个列表中。
为了解决这个问题,我阅读了 parsec.py 库,但到目前为止我找不到像我这样的新手的好例子。我已经尝试在其他文章和文档的帮助下弄清楚它是如何工作的,但目前收效甚微。
test_input.txt:
Society {
A man -> a child [was once]
A man -> an automobile [has]
A woman -> a person [is]
A man -> a person [is]
A man -> a child [was once]
}
我当前的代码:
from typing import List
from parsec import *
class Type:
    """A named type, used as the domain or codomain of an Aspect."""

    # Declared only: there is no constructor, so callers assign ``label``
    # themselves after creating the instance.
    label: str
class Aspect:
    """A labelled relation pointing from a domain Type to a codomain Type."""

    domain: Type
    codomain: Type
    # Declared only: ``__init__`` does not assign it, callers set it later.
    label: str

    def __init__(self) -> None:
        # Each end gets its own fresh Type instance.
        self.domain, self.codomain = Type(), Type()
class Olog:
    """A named collection of Aspect objects."""

    # Declared only: ``__init__`` does not assign it, callers set it later.
    name: str
    aspects: List[Aspect]

    def __init__(self):
        # Start with no aspects; a new list per instance.
        self.aspects = list()
# Script: read the olog file, cut its body into "]"-terminated chunks (one per
# relation) and build an Olog whose aspects store integer positions into a
# running, comma-joined store of type names.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below cannot be read off the text. Comments that talk about
# nesting are assumptions to confirm against the original file.
# NOTE(review): the sample file is introduced as "test_input.txt", but this
# opens 'testinput.txt' -- confirm which name is intended.
with open ('testinput.txt','r') as f:
f_content = f.read()
# Everything before the first "{" becomes the olog name (whitespace is kept).
olog_name = f_content.split("{")[0]
first = Olog()
# Redundant: Olog.__init__ already sets ``aspects`` to an empty list.
first.aspects = []
first.name = olog_name
# Text between the first and second "{"; IndexError if the file has no "{".
olog_data = f_content.split("{")[1]
# One chunk per "]"; the final chunk is whatever follows the last "]".
olog_data_lines = olog_data.split(']')
# Placeholder: binds the ``str`` type object itself; overwritten with "->" below
# and not read anywhere in this listing.
orientation = str
# counter1 counts "->" tokens seen so far; counter2 counts aspects appended.
# counter1 == counter2 therefore presumably means "still reading the domain
# side of the current relation" -- TODO confirm.
counter1 = 0
counter2 = 0
# Despite the ``_str`` suffix these end up holding integer positions.
domain_str = ''
codomain_str = ''
# Created and labelled, but not referenced again in this listing.
type_comma = Type()
type_comma.label = ","
# The first assignment (the ``str`` type object) is overwritten immediately.
string_store = str
# Running store of every type name seen, each followed by ",".
string_store = ''
type_store = Type()
# Placeholder list holding the Type class itself; it is replaced by a list of
# strings before it is read.
type_store_split = [Type]
# ``lines`` is one "]"-delimited chunk, not a list of lines.
for lines in olog_data_lines:
first_type = ''
second_type = ''
# Placeholder: the ``str`` type object, replaced once a "[" token is seen.
aspect_label = str
# first_T is not referenced again in this listing.
first_T = Type()
second_T = Type()
# Put back the "]" that split(']') removed.
lines += ']'
lines_split = lines.split()
type_in_list = False
for word in lines_split:
# A closing brace before any "->" was seen means no relation was parsed.
if word == "}" and counter1 == 0:
print("Olog is empty")
# Closing brace: report and stop (presumably leaves the word loop).
if word == "}":
print(">>>Olog was saved")
break
if word == "->":
counter1 +=1
# Domain side: collect the words before "->" into first_type.
# list.index returns the FIRST position of ``word`` in the chunk, so a word
# that repeats is always judged by its first occurrence.
if counter1 == counter2 and lines_split.index(word) == 0:
first_type = word
if counter1 == counter2 and not lines_split.index(word) == 0:
first_type = first_type + (" " + word)
# Arrow reached: the domain name is complete, so register it in the store.
if word == "->":
orientation = "->"
string_store = string_store + first_type + ","
type_store.label = string_store
# The trailing "," leaves an empty string as the last element of this list.
type_store_split = type_store.label.split(",")
# Find the first stored entry equal to the domain name. It was appended just
# above, so this finds an earlier occurrence or the new entry itself
# (assuming the name contains no comma).
for types in type_store_split:
if types == first_type:
# int() is a no-op here: list.index already returns an int.
domain_str = int(type_store_split.index(types))
type_in_list = True
break
# Fallback: position of the entry appended last (len-1 is the empty tail).
if not type_in_list:
domain_str = int(len(type_store_split)-2)
# Codomain side: counter1 was already incremented for this relation's "->".
if not counter1 == counter2:
# A token starting with "[" opens the label: take the text between the first
# "[" and the following "]" of the whole chunk.
if word[0] == "[":
aspect_label = (lines.split('[',1)[1].split(']')[0])
# Otherwise append the word to the codomain name. The "->" token itself
# arrives here too, hence the replace() that strips it again. If this else
# pairs with the "[" check above, later words of a multi-word label are
# appended as well -- NOTE(review): confirm whether that is intended.
else: second_type = second_type.replace('->','',1) + " " + word
# A token ending in "]" closes the relation: register the codomain name.
if (word[len(word)-1]=="]"):
second_T.label = second_type
string_store = string_store + second_type + ","
type_store.label = string_store
type_store_split = type_store.label.split(",")
for types in type_store_split:
if types == second_type:
codomain_str = int(type_store_split.index(types))
# Overwrites the string label set above with the integer position.
second_T.label = codomain_str
break
# Reached the empty tail element without a match: use the last real entry.
elif types == type_store_split[len(type_store_split)-1]:
codomain_str = int(len(type_store_split)-2)
second_T.label = codomain_str
# Build the aspect. Its domain/codomain labels receive the integer positions
# computed above, not the names; second_T itself is not attached to it.
aspect_A = Aspect()
aspect_A.label = aspect_label
aspect_A.domain = Type()
aspect_A.codomain = Type()
aspect_A.domain.label = domain_str
aspect_A.codomain.label = codomain_str
first.aspects.append(aspect_A)
counter2 += 1
``
解决方法
此解决方案使用 `re` 和递归来解析输入行并遍历结果,再通过 `yield` 依次返回 parents、children 和 pc_names:
import re,collections
# Separator between the two sides of a relation: "->" with optional padding.
_ARROW_RE = re.compile(r'\s*->\s*')
# A token is either a bracketed label "[...]" or a run of word/space characters.
_TOKEN_RE = re.compile(r'\[.*?\]|[\w\s]+')


def parse_line(l):
    """Split one relation line into token lists, one per side of ``->``.

    ``"A man -> a child [was once]"`` becomes
    ``[['A man'], ['a child ', '[was once]']]``.

    Each side is stripped as a whole before tokenising, so a name that is
    followed by a bracketed label keeps the space that separated them
    (``'a child '``). Labels keep their surrounding brackets.

    Args:
        l: One line of input text.

    Returns:
        A list with one token list per ``->``-separated side. A side with no
        word characters and no label yields an empty token list.
    """
    # Raw-string patterns: the previous non-raw literals relied on invalid
    # escape sequences such as "\[" and "\w", which newer Python versions warn
    # about at compile time.
    return [_TOKEN_RE.findall(side.strip()) for side in _ARROW_RE.split(l)]
# Parse every relation line of the input file. Lines containing a brace (the
# "Name {" header and the closing "}") carry no relation and are skipped, as
# are blank lines, which would otherwise produce a one-element entry that
# cannot be unpacked into a (parent, child) pair later on.
# The ``with`` block closes the file as soon as the list has been built.
with open('test_input.txt') as source:
    lines = [
        parse_line(row)
        for row in source
        if row.strip() and not re.search(r'[{}]', row)
    ]
def get_vals(d, s=None):
    """Walk the parsed relations from ``d`` and yield ``(category, value)`` pairs.

    Reads the module-level ``lines`` list, whose entries are
    ``(parent_tokens, child_tokens)`` pairs.

    Yields, in traversal order:
        ``('pc_names', label)`` when ``d`` has more than one token; the label
            is the last token with its first and last character removed.
        ``('parents', name)`` when ``d[0]`` equals the first parent token of
            at least one entry in ``lines``; every matching child side is then
            visited recursively.
        ``('children', name)`` otherwise; the walk then continues with the
            first parent side whose name is not in ``s``, if there is one.

    Args:
        d: Token list for one side of a relation; ``d[0]`` is the name.
        s: Names already collected on the current path. Defaults to an empty
            path. The list is never modified; extended copies are passed down.
    """
    # ``None`` instead of a shared ``[]`` default avoids the mutable-default
    # pitfall; behaviour for callers that omit ``s`` is unchanged.
    seen = [] if s is None else s
    name = d[0]
    if len(d) > 1:
        yield ('pc_names', d[-1][1:-1])

    path = seen + [name]
    child_sides = [child for parent, child in lines if name == parent[0]]
    if child_sides:
        yield ('parents', name)
        for side in child_sides:
            yield from get_vals(side, path)
        return

    yield ('children', name)
    # Jump to the first parent side that is not on the path yet.
    pending = [parent for parent, _ in lines if parent[0] not in seen]
    if pending:
        yield from get_vals(pending[0], path)
# Group the yielded values by category. Sets drop duplicates, so each name or
# label appears once per category; the order inside each printed list is
# whatever order the set iterates in.
result = collections.defaultdict(set)
for category, value in get_vals(lines[0][0]):
    result[category].add(value)
print({category: list(values) for category, values in result.items()})
输出:
{'parents': ['A woman','A man'],'pc_names': ['was once','is','has'],'children': ['a person ','an automobile ','a child ']}
第二个 test_input.txt 的内容:
Object A -> Object B [AB_name]
Object B -> Object C [BC_name]
结果:
{'parents': ['Object B','Object A'],'pc_names': ['AB_name','BC_name'],'children': ['Object B ','Object C ']}