问题描述
我正在处理一个杂乱的函数,以使其更加有效和可读。我的python技能充其量是初学者到中级,我想有一种更简洁的方法可以完成此任务。
下面的函数采用一个字符串,其中包含各种与业务联系人相关的信息。信息用冒号分隔。公司名称始终是第一个字段,因此可以轻松提取,但是其余的“列(冒号之间的数据)可能包含也可能不包含,并且并不总是相同的顺序。
该函数有两个参数,1)rowdata(包含下面示例的字符串)和2)我要返回的数据元素。
# Business Contact @R_113_4045@ion
def parseBusinessContact@R_113_4045@ion(self,rowdata,element):
## Process Business Contact @R_113_4045@ion
## example rowdata = "Business Name,LLC : Business DBA : Email- [email protected] : Phone- 1234567890 : Website- www.site.com"
## example rowdata = "Business Name,LLC : Email- [email protected] : Phone- 1234567890 : Website- www.site.com"
## example rowdata = "Business Name,LLC : Business DBA : Phone- 1234567890 : Website- www.site.com"
## example rowdata = "Business Name,LLC : Phone- 1234567890"
businessName = None
businessDba = None
businessPhone = None
businessEmail = None
businessWebsite = None
# Split rowdata on :
contactData = rowdata.split(':')
## [0] - business name should always be present
businessName = contactData[0].strip()
## [1] - doing_business_as or another field if not present
if 1 < len(contactData) and re.search('email',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessEmail = contactTemp[1].strip()
businessDba = contactData[0].strip()
elif 1 < len(contactData) and re.search('phone',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessPhone = contactTemp[1].strip()
businessDba = contactData[0].strip()
elif 1 < len(contactData) and re.search('website',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessWebsite = contactTemp[1].strip()
businessDba = contactData[0].strip()
elif 1 < len(contactData) and not re.search(r'(phone|email|website)',contactData[1].lower()):
businessDba = contactData[1].strip()
else:
businessDba = self.datanotAvailableMessage
## [2] - phone or email or website
if 2 < len(contactData) and re.search('email',contactData[2].lower()):
contactTemp = contactData[2].split('-')
businessEmail = contactTemp[1].strip()
elif 2 < len(contactData) and re.search('phone',contactData[2].lower()):
contactTemp = contactData[2].split('-')
businessPhone = contactTemp[1].strip()
elif 2 < len(contactData) and re.search('website',contactData[2].lower()):
contactTemp = contactData[2].split('-')
businessWebsite = contactTemp[1].strip()
## [3] - phone or email or website
if 3 < len(contactData) and re.search('email',contactData[3].lower()):
contactTemp = contactData[3].split('-')
businessEmail = contactTemp[1].strip()
elif 3 < len(contactData) and re.search('phone',contactData[3].lower()):
contactTemp = contactData[3].split('-')
businessPhone = contactTemp[1].strip()
elif 3 < len(contactData) and re.search('website',contactData[3].lower()):
contactTemp = contactData[3].split('-')
businessWebsite = contactTemp[1].strip()
if element == "businessName":
return businessName
elif element == "businessDba":
return businessDba
elif element == "businessPhone":
return businessPhone
elif element == "businessEmail":
return businessEmail
elif element == "businessWebsite":
return businessWebsite
else:
return self.datanotAvailableMessage
我正在尝试了解一种更好的方法。
解决方法
重构是一个累积过程。您在Refactoring中对Martin Fowler和Kent Beck的方法有了全面的描述。
它的心脏是一系列小的行为保留转换。 (马丁·福勒,https://refactoring.com/)
最重要的部分是:“小”和“行为保留”。 “小”一词是不言而喻的,但“行为保持”应通过单元测试来确保。
初步说明:我建议您坚持使用PEP 8 Style Guide。
保存行为
用文档字符串(https://www.python.org/dev/peps/pep-0008/#id33)替换您的评论。这非常有用,因为您可以在文档字符串(也称为doctests)中编写一些单元测试。
class MyParser:
dataNotAvailableMessage = "dataNotAvailableMessage"
# Business Contact Information
def parseBusinessContactInformation(self,rowdata,element):
"""Process Business Contact Information
Examples:
>>> p = MyParser()
>>> p.parseBusinessContactInformation("Business Name,LLC : Business DBA : Email- [email protected] : Phone- 1234567890 : Website- www.site.com","businessPhone")
'1234567890'
>>> p.parseBusinessContactInformation("Business Name,LLC : Email- [email protected] : Phone- 1234567890 : Website- www.site.com","businessName")
'Business Name,LLC'
>>> p.parseBusinessContactInformation("Business Name,LLC : Business DBA : Phone- 1234567890 : Website- www.site.com","businessDba")
'Business DBA'
>>> p.parseBusinessContactInformation("Business Name,LLC : Phone- 1234567890","businessEmail") is None
True
>>> p.parseBusinessContactInformation("Business Name,"?")
'dataNotAvailableMessage'
"""
...
import doctest
doctest.testmod()
您应该编写更多的单元测试(使用https://docs.python.org/3/library/unittest.html以避免泛滥文档字符串)来保护当前行为,但这是一个好的开始。
现在,进行一个小转换:查看这些(el)if 1 < len(contactData) and ...
行。您可以只测试一次长度:
if 1 < len(contactData):
if re.search('email',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessEmail = contactTemp[1].strip()
businessDba = contactData[0].strip()
elif re.search('phone',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessPhone = contactTemp[1].strip()
businessDba = contactData[0].strip()
elif re.search('website',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessWebsite = contactTemp[1].strip()
businessDba = contactData[0].strip()
elif not re.search(r'(phone|email|website)',contactData[1].lower()):
businessDba = contactData[1].strip()
else:
businessDba = self.dataNotAvailableMessage
else:
businessDba = self.dataNotAvailableMessage
您注意到倒数第二个else
无法访问:您是否拥有phone
,email
,website
:
if 1 < len(contactData):
if re.search('email',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessWebsite = contactTemp[1].strip()
businessDba = contactData[0].strip()
else:
businessDba = contactData[1].strip()
else:
businessDba = self.dataNotAvailableMessage
对[2]和[3]做同样的事情:
if 2 < len(contactData):
if re.search('email',contactData[2].lower()):
contactTemp = contactData[2].split('-')
businessEmail = contactTemp[1].strip()
elif re.search('phone',contactData[2].lower()):
contactTemp = contactData[2].split('-')
businessPhone = contactTemp[1].strip()
elif re.search('website',contactData[2].lower()):
contactTemp = contactData[2].split('-')
businessWebsite = contactTemp[1].strip()
if 3 < len(contactData):
if re.search('email',contactData[3].lower()):
contactTemp = contactData[3].split('-')
businessEmail = contactTemp[1].strip()
elif re.search('phone',contactData[3].lower()):
contactTemp = contactData[3].split('-')
businessPhone = contactTemp[1].strip()
elif re.search('website',contactData[3].lower()):
contactTemp = contactData[3].split('-')
businessWebsite = contactTemp[1].strip()
现在您可以看到清晰的图案。除了第一部分分配businessDba
,您显然拥有相同过程的三倍。首先,我们在第一部分中隔离businessDba
的分配:
if 1 < len(contactData):
if re.search('(email|phone|website)',contactData[1].lower()):
businessDba = contactData[0].strip()
else:
businessDba = contactData[1].strip()
else:
businessDba = self.dataNotAvailableMessage
然后:
if 1 < len(contactData):
if re.search('email',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessEmail = contactTemp[1].strip()
elif re.search('phone',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessPhone = contactTemp[1].strip()
elif re.search('website',contactData[1].lower()):
contactTemp = contactData[1].split('-')
businessWebsite = contactTemp[1].strip()
在继续之前,我们可以删除行
businessName = None
businessDba = None
由于businessName
和businessDba
始终具有值。并替换新行:
businessDba = contactData[0].strip()
通过:
businessDba = businessName
这使后备明显。
现在,我们有三遍相同的过程。循环是一个好主意:
for i in range(1,3):
if i >= len(contactData):
break
if re.search('email',contactData[i].lower()):
contactTemp = contactData[i].split('-')
businessEmail = contactTemp[1].strip()
elif re.search('phone',contactData[i].lower()):
contactTemp = contactData[i].split('-')
businessPhone = contactTemp[1].strip()
elif re.search('website',contactData[i].lower()):
contactTemp = contactData[i].split('-')
businessWebsite = contactTemp[1].strip()
我们可以提取contactTemp =
,即使它并不总是有用:
for i in range(1,3):
if i >= len(contactData):
break
contactTemp = contactData[i].split('-')
if re.search('email',contactData[i].lower()):
businessEmail = contactTemp[1].strip()
elif re.search('phone',contactData[i].lower()):
businessPhone = contactTemp[1].strip()
elif re.search('website',contactData[i].lower()):
businessWebsite = contactTemp[1].strip()
那更好,但是我发现最后一部分(if element == ...
)确实很麻烦:您针对所有可能性测试element
。这里有人想要一本字典。对于一个小的转换,我们可以编写:
d = {
"businessName": businessName,"businessDba": businessDba,"businessPhone": businessPhone,"businessEmail": businessEmail,"businessWebsite": businessWebsite
}
return d.get(element,self.dataNotAvailableMessage)
现在,我们无需创建dict末尾的字典,而是可以动态创建并更新它:
d = {
"businessPhone": None,"businessEmail": None,"businessWebsite": None
}
# Split rowdata on :
contactData = rowdata.split(':')
## [0] - business name should always be present
d["businessName"] = contactData[0].strip()
if 1 < len(contactData):
if re.search('(email|phone|website)',contactData[1].lower()):
d["businessDba"] = d["businessName"]
else:
d["businessDba"] = contactData[1].strip()
else:
d["businessDba"] = self.dataNotAvailableMessage
for i in range(1,4):
if i >= len(contactData):
break
contactTemp = contactData[i].split('-')
if re.search('email',contactData[i].lower()):
d["businessEmail"] = contactTemp[1].strip()
elif re.search('phone',contactData[i].lower()):
d["businessPhone" = contactTemp[1].strip()
elif re.search('website',contactData[i].lower()):
d["businessWebsite"] = contactTemp[1].strip()
return d.get(element,self.dataNotAvailableMessage)
我对每个修改都运行了测试,并且仍然可以运行,但是阅读起来并不容易。我们可以提取一个创建字典的函数:
def parseBusinessContactInformation(self,element):
d = self._parseBusinessContactInformation(rowdata)
return d.get(element,self.dataNotAvailableMessage)
def _parseBusinessContactInformation(self,rowdata):
...
行为变化很小
这还不错,但是我们可以通过小的行为更改来改善这一点(希望您对这种新行为没问题!):
for i in range(1,4):
if i >= len(contactData):
break
contactTemp = contactData[i].split('-')
if len(contactTemp) > 1:
d["business" + contactTemp[0].strip()] = contactTemp[1].strip()
行为变化是什么?简而言之,我们现在接受类似
>>> p = MyParser()
>>> p.parseBusinessContactInformation("Business Name,LLC : Business DBA : Foo- Bar","businessFoo")
'Bar'
由于我们接受更多element
,因此应更改循环range
:
for i in range(1,len(contactData)):
...
现在该关注一点不一致的地方了:为什么businessDba
会具有为不存在的元素创建的值self.dataNotAvailableMessage
?我们应该使用None
:
d = {
"businessDba": None,...
}
并删除这两行:
else:
d["businessDba"] = self.dataNotAvailableMessage
这可以简化:
if 1 < len(contactData):
if "-" in contactData[1]:
d["businessDba"] = d["businessName"]
else:
d["businessDba"] = contactData[1].strip()
这是代码:
def parseBusinessContactInformation(self,element):
"""Process Business Contact Information
Examples:
>>> p = MyParser()
>>> p.parseBusinessContactInformation("Business Name,"businessPhone")
'1234567890'
>>> p.parseBusinessContactInformation("Business Name,"businessName")
'Business Name,LLC'
>>> p.parseBusinessContactInformation("Business Name,"businessDba")
'Business DBA'
>>> p.parseBusinessContactInformation("Business Name,"businessEmail") is None
True
>>> p.parseBusinessContactInformation("Business Name,"?")
'dataNotAvailableMessage'
>>> p.parseBusinessContactInformation("Business Name,"businessFoo")
'Bar'
"""
d = self._parseBusinessContactInformation(rowdata)
return d.get(element,self.dataNotAvailableMessage)
def _parseBusinessContactInformation(self,rowdata):
d = {
"businessDba": None,"businessPhone": None,"businessWebsite": None
}
# Split rowdata on :
contactData = rowdata.split(':')
## [0] - business name should always be present
d["businessName"] = contactData[0].strip()
if 1 < len(contactData):
if "-" in contactData[1]:
d["businessDba"] = d["businessName"]
else:
d["businessDba"] = contactData[1].strip()
for i in range(1,len(contactData)):
contactTemp = contactData[i].split('-')
if len(contactTemp) > 1:
d["business" + contactTemp[0].strip()] = contactTemp[1].strip()
return d
最后一点:切换到蛇形盒,创建一个get
和一个parse
函数:parse
返回一个字典,而get
返回一个值:
data_not_available_message = "dataNotAvailableMessage"
def get_business_contact_information(self,element):
"""Process Business Contact Information
Examples:
>>> p = MyParser()
>>> p.get_business_contact_information("Business Name,"businessPhone")
'1234567890'
>>> p.get_business_contact_information("Business Name,LLC'
>>> p.get_business_contact_information("Business Name,"businessDba")
'Business DBA'
>>> p.get_business_contact_information("Business Name,"businessEmail") is None
True
>>> p.get_business_contact_information("Business Name,"?")
'dataNotAvailableMessage'
>>> p.get_business_contact_information("Business Name,"businessFoo")
'Bar'
:param rowdata: ...
:param element: ...
:return: ...
"""
d = self._parse_business_contact_information(rowdata)
return d.get(element,self.data_not_available_message)
进行了一些外观更改,使其更具有Pythonic:
def parse_business_contact_information(self,rowdata):
"""Process Business Contact Information
Examples:
>>> p = MyParser()
>>> p.parse_business_contact_information("Business Name,LLC : Business DBA : Email- [email protected] : Phone- 1234567890 : Website- www.site.com") == {
... 'businessDba': 'Business DBA','businessPhone': '1234567890','businessEmail': '[email protected]',... 'businessWebsite': 'www.site.com','businessName': 'Business Name,LLC'}
True
>>> p.parse_business_contact_information("Business Name,LLC : Phone- 1234567890") == {
... 'businessDba': 'Business Name,LLC','businessEmail': None,... 'businessWebsite': None,LLC'}
True
:param rowdata: ...
:return: ...
"""
d = dict.fromkeys(("businessDba","businessPhone","businessEmail","businessWebsite"))
name,*others = rowdata.split(':') # destructuring assignment
d["businessName"] = name.strip()
if not others:
return d
if "-" in others[0]:
d["businessDba"] = d["businessName"]
else:
d["businessDba"] = others[0].strip()
others.pop(0) # consume others[0]
for data in others:
try:
key,value = data.split('-',1) # a- b-c => a,b-c
except ValueError: # too many/not enough values to unpack
print("Element {} should have a dash".format(data))
else:
d["business" + key.strip()] = value.strip()
return d
代码并不完美,但至少在我看来比以前更清晰。
总结方法:
- 编写单元测试以保护行为;
- 进行一些小的转换以保留行为 并提高可读性。将您可以做的事情分解为因素,而不在这里关注性能;
- 继续进行,直到您清楚一些为止/绕圈转圈并进行不必要的修改时停止;
- 如有必要,提高性能。