Python lxml.etree 模块,HTML 实例源码
我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用lxml.etree.HTML。
def QA_fetch_get_stock_block():
    """Crawl stock-block listings from q.10jqka.com.cn.

    Returns a pandas DataFrame with columns [blockname, code, type, source],
    indexed by stock code (index kept as a column too).
    """
    # Block category pages on 10jqka (concept / region / industry variants —
    # original comment was garbled; TODO confirm exact meanings).
    url_list = ['gn', 'dy', 'thshy', 'zjhhy']
    data = []
    for item in url_list:
        tree = etree.HTML(requests.get(
            'http://q.10jqka.com.cn/{}/'.format(item), headers=headers).text)
        gn = tree.xpath('/html/body/div/div/div/div/div/a/text()')
        gpath = tree.xpath('/html/body/div/div/div/div/div/a/@href')
        for _i in range(len(gn)):
            for page in range(1, 15):
                _data = etree.HTML(requests.get(
                    'http://q.10jqka.com.cn/{}/detail/order/desc/page/{}/ajax/1/code/{}'.format(
                        item, page, gpath[_i].split('/')[-2]), headers=headers).text)
                name = _data.xpath('/html/body/table/tbody/tr/td[3]/a/text()')
                code = _data.xpath('/html/body/table/tbody/tr/td[3]/a/@href')
                for i_ in range(len(name)):
                    stock_code = code[i_].split('/')[-1]
                    print('Now Crawling-{}-{}-{}-{}'.format(gn[_i], stock_code, item, 'ths'))
                    # BUG FIX: previously appended only [blockname, source],
                    # but the DataFrame below expects four columns.
                    data.append([gn[_i], stock_code, item, 'ths'])
    return pd.DataFrame(
        data, columns=['blockname', 'code', 'type', 'source']).set_index('code', drop=False)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Each of *num_elements* iterations randomly appends one of: an opening
    tag, a short random sentence (via the module-level ``rsentence`` helper),
    a closing tag, or nothing — so tags generally do not balance. The
    fragments are joined with newlines inside <html>...</html>.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0,3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1,4)))
        elif choice == 2:
            # Close a tag (may not match any open tag).
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing this iteration.
    return "<html>" + "\n".join(elements) + "</html>"
def _get_html(cls, html, url, html_etree, params, **kwargs):
    """Resolve one of (html, url, html_etree) into a parsed lxml tree.

    Priority: a raw HTML string, then a URL to fetch (encoding detected
    with cchardet), then an already-parsed tree. Raises ValueError when
    none of the three was supplied.
    """
    if html:
        return etree.HTML(html)
    if url:
        if not kwargs.get('headers'):
            # Supply a UA when the caller gave none (or an empty one).
            kwargs['headers'] = {
                "User-Agent": get_random_user_agent()
            }
        response = requests.get(url, **kwargs)
        response.raise_for_status()
        raw = response.content
        encoding = cchardet.detect(raw)['encoding']
        return etree.HTML(raw.decode(encoding))
    if html_etree is not None:
        return html_etree
    raise ValueError("html(url or html_etree) is expected")
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def parse_page(page, pattern):
    """Parse a proxy-list HTML page and yield proxy records.

    pattern: dict of XPath strings with keys "ip", "port", "type".
    Yields dicts {"ip_port": "ip:port", "type": 0|1, "db_flag": False},
    where type is 1 when the type cell mentions "https".
    """
    tree = etree.HTML(page.lower())
    ips = tree.xpath(pattern["ip"])
    ports = tree.xpath(pattern["port"])
    types = tree.xpath(pattern["type"])
    # zip truncates to the shortest list, avoiding the IndexError the old
    # index-based loop could hit on ragged results.
    for ip_node, port_node, type_node in zip(ips, ports, types):
        ret = {}
        # IDIOM FIX: no longer shadows the builtin `str`.
        ret["ip_port"] = "%s:%s" % (ip_node.text, port_node.text)
        if type_node.text.find("https") == -1:
            ret["type"] = 0
        else:
            ret["type"] = 1
        ret["db_flag"] = False
        yield ret
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def get_list(self, search_url):
    """Fetch the hot-news keyword sidebar from *search_url*, persist the
    top entries via self.save(), and return the raw lists.

    Returns {'keyurl': [...], 'keyword': [...]}.
    """
    html = requests.get(search_url, headers=self.headers, verify=False).content
    selector = etree.HTML(html)
    # Hot-news sidebar links and titles.
    keyurl = selector.xpath('//div[@class="aside"]/ol[@class="hot-news"]/li/a/@href')
    keyword = selector.xpath('//div[@class="aside"]/ol[@class="hot-news"]/li/a/text()')
    res = {'keyurl': keyurl, 'keyword': keyword}
    # BUG FIX: was a hard-coded range(0, 10), which raised IndexError
    # whenever fewer than 10 entries were scraped.
    for x in range(min(10, len(keyword), len(keyurl))):
        data = {
            'table_name': 'dailyKeyword',
            'keyword': keyword[x],
            'keyurl': keyurl[x],
            'id': x + 1,
        }
        self.save(data)
    return res
# (original Chinese comment lost to mojibake in this extraction)
def __init__(self, data=None, response=None, url=None, logFile=None, color=True, debug=4):
    """Build the parser.

    data:     raw page text (str/unicode), typically response.text
    response: a Response object; its request URL is used when url is None
    url:      explicit URL override
    logFile:  optional path for log output
    color:    colourise log output
    debug:    verbosity 0-4 (0 silent, 1 errors ... 4 everything)
    """
    self.logFile = logFile
    self.color = color
    self.debug = debug
    self.data = data
    self.response = response
    try:
        if response and not url:
            self.url = response.request.url
        else:
            self.url = url
        # Pre-parse the document when raw text was supplied.
        self._html = etree.HTML(data) if data else None
    except Exception as e:
        printText("[Error]parser.py Parser __init__:%s" % e,
                  logFile=self.logFile, color=self.color, debug=self.debug)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def get_type_id():
    """Collect the distinct model ids (data-value attributes) from the
    autohome category index pages and return them as a de-duplicated list.
    """
    # Category pages; labels assumed from the URL slugs — TODO confirm.
    start_url_list = [
        'http://www.autohome.com.cn/a00/',
        'http://www.autohome.com.cn/a0/',
        'http://www.autohome.com.cn/a/',
        'http://www.autohome.com.cn/b/',
        'http://www.autohome.com.cn/c/',
        'http://www.autohome.com.cn/d/',
        'http://www.autohome.com.cn/suv/',   # SUV
        'http://www.autohome.com.cn/mpv/',   # MPV
        'http://www.autohome.com.cn/s/',
        'http://www.autohome.com.cn/p/',
        'http://www.autohome.com.cn/mb/',
    ]
    model_ids = set()
    for page_url in start_url_list:
        page = etree.HTML(process_request(page_url))
        model_ids.update(page.xpath('.//a/@data-value'))
    return list(model_ids)
def set_nasa_wallpaper():
    """Download today's NASA APOD image and set it as the background."""
    stamp = datetime.fromtimestamp(time.time()).strftime('%y%m%d')
    r = requests.get(URL07.format(stamp))
    if r.status_code != 200:
        return
    try:
        # recover=True lets lxml tolerate the page's loose HTML.
        html = etree.HTML(r.content, etree.HTMLParser(recover=True))
        images = list(html.iter('img'))
        if images:
            # The first <img> is wrapped in an <a href> pointing at the
            # full-resolution picture.
            image_url = 'https://apod.nasa.gov/' + images[0].getparent().attrib['href']
            if download(image_url) is True:
                set_background(comun.POTD)
    except Exception as e:
        print(e)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def show_body():
    """Load a saved crawl body from cq_error.txt and print each listing.

    Debug helper: prints the detail URL, name and description fragments of
    every li.pictext node in the saved HTML.
    """
    with open('cq_error.txt', 'r') as fp:
        content = json.loads(fp.read())['body']
    tree = etree.HTML(content)
    nodes = tree.xpath('//li[@class="pictext"]')
    for node in nodes:
        xiaoqu_url = node.xpath('.//a[@class="flexBox post_ulog"]/@href')[0]
        name = node.xpath('.//div[@class="item_list"]/div[@class="item_main"]/text()')[0]
        desc = node.xpath('.//div[@class="item_list"]/div[@class="item_other text_cut"]/text()')[0]
        details = desc.split()
        # MODERNIZED: Python 2 print statements converted to print().
        print(xiaoqu_url)
        print(name)
        print(len(details))
        for detail in details:
            print(detail)
        print('')
def get_city_link():
    """Scrape the lianjia mobile city index and return full city URLs,
    skipping the /sh/, /su/ and /xsbn/ entries."""
    headers = {'Host': 'm.lianjia.com',
               'User-Agent': 'UCWEB/2.0 (Linux; U; Adr 2.3; zh-CN; MI-ONEPlus) U2/1.0.0 UCbrowser/8.6.0.199 U2/1.0.0 Mobile'}
    r = requests.get(url='https://m.lianjia.com/city/', headers=headers)
    tree = etree.HTML(r.text)
    # The second item_lists block holds the city links.
    city_block = tree.xpath('//ul[@class="item_lists"]')[1]
    skipped = ('/sh/', '/su/', '/xsbn/')
    city_list = []
    for city in city_block:
        link = city.xpath('.//a/@href')[0]
        if link in skipped:
            continue
        city_list.append('https://m.lianjia.com' + link)
    return city_list
def debug_page():
    """Fetch one qfang rent page for interactive debugging.

    Prints status/raw content and returns (parsed tree, raw text).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'
    }
    url = 'http://m.qfang.com/guangzhou/rent/100001468?gardenId=1109818'
    r = requests.get(url=url, headers=headers)
    # MODERNIZED: Python 2 print statements converted to print().
    print(r.status_code)
    print(type(r.content))
    print(r.content)
    tree = etree.HTML(r.text, parser=etree.HTMLParser(encoding='utf-8'))
    return tree, r.text
# (original Chinese comment lost to mojibake; presumably about the request headers used above)
def testcase2():
    """Parse a saved lianjia JSON dump and print its node count plus the
    'total' field extracted from the args payload."""
    # FIX: file handle was previously left unclosed.
    with open('lianjia_sh.txt') as fp:
        js = json.loads(fp.read())
    tree = etree.HTML(js['data'])
    nodes = tree.xpath('//li[@class="pictext"]')
    # MODERNIZED: Python 2 print statements converted to print().
    print("NODE:", len(nodes))
    print(js['args'])
    print('*' * 20)
    print(type(js))
    print(type(js['args']))
    # Extract the numeric "total" field from the args string.
    p = re.compile(r'"total":(\d+)')
    s = p.findall(js['args'])[0]
    print(s)
def lxml_case2():
    """Demo: parse an XML snippet with etree.HTML and print the result of a
    relative 'bookstore' XPath lookup."""
    str1 = '''
<bookstore>
<book>
<title>Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
</bookstore>
'''
    tree = etree.HTML(str1)
    t1 = tree.xpath('bookstore')
    # MODERNIZED: Python 2 print statement converted to print().
    print(t1)
def getData(self):
    """Fetch the qq appstore category menu and return the categoryId
    values found in its links (as strings)."""
    parent_url = 'http://sj.qq.com/myapp/category.htm?orgame=1'
    s = requests.get(url=parent_url, headers=self.headers)
    # MODERNIZED: Python 2 print statements converted to print().
    print(s.status_code)
    tree = etree.HTML(s.text)
    menu = tree.xpath('//ul[@class="menu-junior"]')[0]
    print(type(menu))
    link = menu.xpath('.//li[@id]/a/@href')
    catelog = []
    # PERF: regex compiled once, outside the loop; ids may be negative.
    p = re.compile(r'categoryId=(-?\d+)')
    for i in link:
        print(i)
        catelog.append(p.findall(i)[0])
    return catelog
def get_list(self, cookies):
    """Fetch listing page self.page and return [[title, link], ...] pairs.

    Returns 10001 when the body contains 'window.v=' (presumably an
    anti-crawler interstitial — TODO confirm). On a non-200 response it
    waits 5 seconds and retries.
    """
    print("?????%s???\r\n" % self.page)
    page_r = requests.get(self.targetUrl + "&page=%s" % self.page, cookies=cookies)
    if page_r.status_code == 200:
        if 'window.v=' in page_r.text:
            return 10001
        tree = etree.HTML(page_r.text)
        init_list = tree.xpath('//*[@id="ht-kb"]/article/h3/a')
        list_array = []
        for item in init_list:
            item_link = item.get('href')
            item_text = item.text
            list_array.append([item_text, item_link])
        return list_array
    else:
        print("???????5??????\r\n")
        time.sleep(5)
        # BUG FIX: the retry previously called self.get_list() without the
        # required cookies argument, raising TypeError.
        return self.get_list(cookies)
def get_proxys(pages=4):
    """Scrape xicidaili proxy pages 1..pages and return formatted proxies.

    Returns a list of strings built with constants.HTTP_PROXY_FORMATTER.
    """
    proxy_list = []
    # BUG FIX: the URL previously had no {page_no} placeholder, so
    # url.format(page_no=...) was a no-op and every iteration fetched the
    # same first page.
    url = 'http://www.xicidaili.com/wn/{page_no}'
    headers = generate_http_header()
    headers.update(
        {
            'Referer': 'http://www.xicidaili.com/wn/',
            'Host': 'www.xicidaili.com',
        }
    )
    for page_no in range(1, pages + 1):
        response = requests.get(url=url.format(page_no=page_no), headers=headers)
        html = etree.HTML(response.text)
        ips = html.xpath("//table[@id='ip_list']/tr/td[2]/text()")
        ports = html.xpath("//table[@id='ip_list']/tr/td[3]/text()")
        assert len(ips) == len(ports)
        for (ip, port) in zip(ips, ports):
            proxy_list.append(constants.HTTP_PROXY_FORMATTER.format(ip=ip, port=port))
    return proxy_list
def requests_company_detail_data(company_id):
    """Fetch a company detail page and return its formatted tag data.

    Raises RequestsError (after logging) when the HTTP request fails.
    """
    headers = generate_http_header()
    crawler_sleep()
    try:
        response = requests.get(
            url=constants.COMPANY_DETAIL_URL.format(company_id=company_id),
            headers=headers,
            cookies=Cookies.get_random_cookies(),
            allow_redirects=False,
            timeout=constants.TIMEOUT)
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    page = etree.HTML(response.text)
    # Pull the four sections used by format_tag.
    advantage = page.xpath('//div[@id="tags_container"]//li/text()')
    size = page.xpath('//div[@id="basic_container"]//li[3]/span/text()')
    address = page.xpath('//p[@class="mlist_li_desc"]/text()')
    introduce = page.xpath('//span[@class="company_content"]//text()')
    return format_tag(advantage, address, size, introduce, company_id)
def requests_job_detail_data(job_id):
    """Fetch a job detail page and return its formatted fields.

    Raises RequestsError (after logging) when the HTTP request fails.
    """
    headers = generate_http_header()
    crawler_sleep()
    try:
        # BUG FIX: `headers` was built but never passed to requests.get,
        # so the generated header set was unused.
        response = requests.get(
            url=constants.JOB_DETAIL_URL.format(job_id=job_id),
            headers=headers,
            timeout=constants.TIMEOUT)
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    html = etree.HTML(response.text)
    department = html.xpath('//div[@class="job-name"]/div[@class="company"]/text()')
    description = html.xpath('//dd[@class="job_bt"]/div//text()')
    keywords = html.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()')
    return format_tag(department, description, keywords, job_id)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def index(url='http://music.163.com/discover'):
    """Crawl the netease discover page and enqueue one playlist task per
    /playlist link found."""
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/58.0.3029.110 Safari/537.36 DOL/s_1511_r2x9ak474125_821',
    }
    try:
        r = requests.get(url, headers=headers, timeout=4)
        html = etree.HTML(r.content)
        play_lists = [urlparse.urljoin('http://music.163.com/', link) for link in
                      html.xpath('//*[@id="discover-module"]/div[1]/div/div/div[1]/ul//li/div/a/@href') if
                      link.startswith('/playlist')]
        for url in play_lists:
            app.send_task(
                'tasks.playlist.playlist',
                args=(url, ),
                queue='playlist_queue',
                routing_key='tasks_playlist'
            )
    except Exception:
        # BUG FIX: was a bare except that also swallowed SystemExit and
        # KeyboardInterrupt; py2 print converted to print().
        print('????')
def playlist(url):
    """Fetch a playlist page and enqueue one comment task per song link.

    NOTE(review): reconstructed — the headers dict and request call were
    truncated in this dump; header values mirror the sibling index() task.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
    }
    try:
        r = requests.get(url, headers=headers, timeout=4)
        if r.status_code == 200:
            html = etree.HTML(r.content)
            ids = [search(link).group() for link in html.xpath('//a/@href')
                   if link.startswith('/song?id') and search(link)]
            for song_id in ids:
                url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_{}?csrf_token='.format(song_id)
                app.send_task(
                    'tasks.comment.comment',
                    args=(url, song_id),
                    queue='comment_queue',
                    routing_key='tasks_comment'
                )
                time.sleep(5)
    except Exception:
        print('????')
def parse(self, response):
    """Extract weibo user ids from a page, persist their info, and return
    the fan-page URLs to crawl next (or None for an empty response)."""
    if not response:
        return None
    et = etree.HTML(response)
    links = et.xpath("//*[@valign='top'][1]/a/@href")
    urls = []
    for link in links:
        # MODERNIZED: Python 2 print statements converted to print().
        print(link)
        # Only numeric /u/<uid> profile links carry an explicit uid;
        # vanity URLs (e.g. /renzhenghao) are skipped.
        uid = re.findall(r"http://weibo\.cn/u/(\w*)", link)
        if uid:
            uid = uid[0]
        else:
            continue
        SinaWeiboItem["uid"] = uid
        info_url = "http://weibo.cn/{uid}/info".format(uid=uid)
        Request(info_url, callback=self.parse_info)
        datas = {"uid": SinaWeiboItem["uid"], "name": SinaWeiboItem["name"], "info": SinaWeiboItem["info"]}
        print(sina_info.insert(datas))
        # Queue this user's fans page for the next crawl round.
        urls.append("http://weibo.cn/{uid}/fans".format(uid=uid))
    return urls
def media_by_tag(browser, tag_url, media_url, tag, media_max_likes, media_min_likes):
    """Return the posts for an Instagram tag page, filtered by like count
    and comments-enabled, as {'posts': [...]|False, 'tag': tag}."""
    result = {'posts': False, 'tag': tag}
    try:
        explore_site = browser.get(tag_url % (tag))
        tree = etree.HTML(explore_site.text)
        data = return_sharedData(tree)
        if data:
            nodes = data['entry_data']['TagPage'][0]['tag']['media']['nodes']
            result['posts'] = [{'user_id': n['owner']['id'],
                                'username': return_username(browser, n['code']),
                                'likes': n['likes']['count'],
                                'caption': n['caption'],
                                'media_id': n['id'],
                                'url_code': n['code']}
                               for n in nodes
                               if media_min_likes <= n['likes']['count'] <= media_max_likes
                               if not n['comments_disabled']]
    except Exception as e:
        # MODERNIZED: Python 2 print statement converted to print().
        print('\nError in obtaining media by tag: %s' % (e))
    return result
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def parse_main_subjects(self, content):
    """Extract subject detail-page URLs from a listing page.

    content: raw HTML of the listing page.
    Returns a list of href strings; an empty list on any parse error.
    """
    try:
        tree = etree.HTML(content.lower())
        items = tree.xpath('//ul[@class="img"]/li')
        # First child of each <li> is the anchor whose href we want.
        return [item[0].get('href') for item in items]
    except Exception as e:
        print(str(e))
        return list()
def replace_InvalidTag(Html):
    """Strip CDATA blocks, <br>, ALL whitespace, comments, <style>/<script>
    bodies and every remaining HTML tag from *Html*; returns bare text.

    Note: whitespace removal happens before tag stripping, mirroring the
    original pipeline (so text like 'a b' becomes 'ab').
    """
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # commented-out CDATA
    Html = re_cdata.sub('', Html)
    # BUG FIX: this pattern was previously compiled with `Html` passed as
    # the flags argument (TypeError on every call) and its substitution
    # was never applied.
    re_cdata = re.compile(r'<!\[CDATA\[[^>]*//\]\]>', re.I)
    Html = re_cdata.sub('', Html)
    re_br = re.compile(r'<br\s*?/?>')  # line breaks -> newline
    Html = re_br.sub('\n', Html)
    space_line = re.compile(r'\s+')  # drop all whitespace (incl. the \n above)
    Html = space_line.sub('', Html)
    re_comment = re.compile(r'<!--[^>]*-->')  # HTML comments
    Html = re_comment.sub('', Html)
    re_style = re.compile(r'<style\s*[^>]*>(.*?)</style\s*>')
    Html = re_style.sub('', Html)
    re_script = re.compile(r'<script\s*[^>]*>(.*?)</script>')
    Html = re_script.sub('', Html)
    re_h = re.compile(r'</?[^>]*>')  # any remaining tag
    Html = re_h.sub('', Html)
    return Html
def replace_CharEntity(Html):
    """Replace common HTML character entities (&amp;, &lt;, &#60;, ...) in
    *Html* with their literal characters; unknown entities are removed."""
    CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"', }
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')
    sz = re_charEntity.search(Html)
    while sz:
        key = sz.group('name')  # entity body without '&'/'#'/';' (e.g. 'gt')
        try:
            Html = re_charEntity.sub(CHAR_ENTITIES[key], Html, 1)
        except KeyError:
            # Unknown entity: drop it.
            # BUG FIX: original called sub('', 1) without the target string,
            # raising TypeError whenever an unknown entity was hit.
            Html = re_charEntity.sub('', Html, 1)
        sz = re_charEntity.search(Html)
    return Html
def extract_Meta(html):
    """Extract <meta> content attributes containing Chinese text.

    Returns the list of decoded values, or False when none are found.
    """
    if chardet.detect(html)['encoding'] == 'utf-8':
        html = html.decode('utf-8')
    Meta_list = []
    page = etree.HTML(html.lower())
    # BUG FIX: the page text is lower-cased above and XPath names are
    # case-sensitive, so '//Meta/@content' could never match; use 'meta'.
    xpath_result = page.xpath(u"//meta/@content")
    for once_xpath_result in xpath_result:
        # Keep only values that contain Chinese characters.
        if zh_check(once_xpath_result) == True:
            Meta_list.append(utf8_transfer(once_xpath_result).decode('utf-8'))
    if Meta_list != []:
        return Meta_list
    else:
        return False
def validProxy(self):
    """Scan 66ip.cn's first area page for a working proxy.

    Returns a requests-style proxies dict {"https": "https://ip:port"} for
    the first candidate passing both verification checks, else None.
    """
    url = 'http://www.66ip.cn/areaindex_1/1.html'
    response = requests.get(url=url, headers=self.headers)
    # Page is served in GBK.
    tree = etree.HTML(response.content.decode('gbk'))
    for row in tree.xpath('.//table//tr'):
        # First two cells are ip and port.
        candidate = ':'.join(row.xpath('./td/text()')[0:2])
        if not self.__verifyProxy(candidate):
            continue
        if self.__isVaildProxy(candidate):
            return {
                "https": "https://{proxy}".format(proxy=candidate)
            }
    return None
# (original Chinese comment lost to mojibake; presumably about validating proxy IPs)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def get_xml_data(req_string, headers, data=None):
    """Request *req_string* and return the response parsed as an XML tree,
    falling back to lxml's HTML parser when strict XML parsing fails.

    Raises urllib2.HTTPError(404) when the response is not parseable at
    all (e.g. a 404 page served without a 404 status code).
    """
    req = urllib2.Request(req_string, headers=headers)
    html_data = _get_html_data(req, data)
    # Strip chunked transfer-encoding artifacts before parsing.
    html_data = clean_chunked_data(html_data)
    try:
        parsed = etree.fromstring(html_data)
    except XMLSyntaxError:
        # lxml's XML parser cannot handle in-document encoding
        # declarations; retry with the forgiving HTML parser.
        parsed = etree.HTML(html_data, etree.HTMLParser())
    if parsed is None:
        raise urllib2.HTTPError(req_string, 404, "Not an XML", None, None)
    # Todo: check the document actually contains a .//prestashop node.
    return parsed.getroottree()
def MakePoem(word):
    """Search so.gushiwen.org for *word* and return the matched poem body.

    The text is stripped of newlines/spaces and its trailing character
    (punctuation on the source page) is dropped.
    """
    url = "http://so.gushiwen.org/search.aspx?value=" + word
    res = requests.get(url)
    res.encoding = 'utf-8'
    root = etree.HTML(res.content)
    # Second result block holds the poem body paragraph.
    block = root.xpath('//div[@class="sons"][2]/p[@style="margin-bottom:0px;"]')[0]
    text = block.xpath('string(.)').replace('\n', '').replace(' ', '')
    return text[:-1]
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def parse_home(self, home_content):
    """Parse the fund home page and return the 6-digit fund codes found in
    its link texts (or None when no content was given)."""
    if home_content is None:
        return None
    # Page was fetched as ISO-8859-1 but is actually GBK; re-decode.
    home_content = home_content.encode('ISO-8859-1').decode('gbk')
    html = etree.HTML(home_content, parser=etree.HTMLParser(encoding='utf-8'))
    alinks = html.xpath('//a[@href]')
    # NOTE(review): Python 2 `ur` literal; the characters before/after the
    # capture groups were garbled in this dump — expected shape appears to
    # be <bracket>(\d{6})<bracket>(.+), i.e. code then fund name.
    pattern_capture = re.compile(ur"?(\d{6})?(.+)")
    l = []
    for alink in alinks:
        aa = alink.text
        if aa != None:
            match = pattern_capture.match(aa)
            if match:
                # Keep only the 6-digit code; the name (group 2) is unused.
                l.append(match.group(1))
    return l
# (original Chinese comment garbled; it presumably described the ratio-parsing helper below)
def parse_ratio(self, info, content):
    """Accumulate the institutional holding ratio parsed from *content*
    into info.inratio.

    The first td.tor cell holds the percentage; cells showing '---'
    (no data) are skipped. Pages with two or fewer cells are ignored.
    """
    tree = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    cells = tree.xpath('//td[@class="tor"]')
    if len(cells) <= 2:
        return
    ratio_text = cells[0].text
    if ratio_text != '---':
        info.inratio += safe_to_float(ratio_text.split("%")[0])
def parse_stocks(self, content):
    """Parse the fund holdings table and append 'name-percent' strings to
    info.stocks.

    NOTE(review): `info` is not a parameter here (cf. parse_ratio, which
    takes one) — it must come from enclosing scope; confirm against the
    full file.
    """
    html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    tbs = html.xpath('//table[@class="w782 comm tzxq"]')
    if len(tbs) > 0:
        stocktds = tbs[0].xpath('.//td[@class="tol"]/a')
        pers = tbs[0].xpath('.//td[@class="tor"]')
        # Column layout differs between the two table variants; pick the
        # offset (front) and stride (interval) of the percentage cell.
        front, interval = 2, 5
        if not '???' in content:
            front, interval = 0, 3
        for (index, stocked) in enumerate(stocktds):
            per = pers[index * interval + front]
            # BUG FIX: previously compared the Element object itself to
            # '---' (always False); compare its text instead.
            if per.text == '---':
                continue
            stockname = stocked.text
            if stockname is not None and len(stockname) > 0:
                info.stocks.append(stockname + '-' + per.text)
def parse_index_list(self, index_list_content):
    """Parse the index catalogue page into a list of IndexInfo objects.

    Each 5-column table row whose first cell looks like 'NNNNNN.XX' yields
    one IndexInfo carrying code, full code, name, inception date, short
    name and the weaving-rule link (or plain text).
    """
    # Page was fetched as ISO-8859-1 but is actually UTF-8; re-decode.
    index_list_content = index_list_content.encode('ISO-8859-1').decode('utf-8')
    parsed_content = etree.HTML(index_list_content, parser=etree.HTMLParser(encoding='utf-8'))
    trs = parsed_content.xpath('//tbody/tr')
    indexs = []
    for tr in trs:
        tds = tr.xpath('./td')
        if len(tds) == 5:
            index = IndexInfo()
            code = tds[0].text.strip()
            # Codes look like '000300.SH'; keep the numeric part separately.
            if len(code.split('.')) == 2:
                index.code = code.split('.')[0]
                index.full_code = code
                index.name = tds[1].text.strip()
                index.begin_time = tds[2].text.strip()
                index.short_name = tds[3].text.strip()
                # The weaving rule is either a link or plain cell text.
                weave = tds[4].xpath('./a')
                if len(weave) == 1:
                    index.weave = weave[0].attrib['href'].strip()
                else:
                    index.weave = tds[4].text.strip()
                indexs.append(index)
    return indexs
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def getHtmlTree(url, **kwargs):
    """Fetch *url* (through WebRequest) and return it parsed as an lxml
    HTML tree.

    NOTE(review): the Accept / Accept-Encoding header values were truncated
    in this dump and have been reconstructed with conventional values.
    """
    header = {'Connection': 'keep-alive',
              'Cache-Control': 'max-age=0',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML,like Gecko)',
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Encoding': 'gzip, deflate, sdch',
              'Accept-Language': 'zh-CN,zh;q=0.8',
              }
    wr = WebRequest()
    # Throttle: delay 2s per request.
    time.sleep(2)
    html = wr.get(url=url, header=header).content
    return etree.HTML(html)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def get_html_tree(url, headers=None, cookie=None, proxy=None):
    """GET *url* and return the body parsed as an lxml HTML tree.

    headers defaults to the module-level HEADERS; any request/HTTP error
    is logged and re-raised.
    """
    if headers is None:
        headers = HEADERS
    try:
        # BUG FIX: `headers` was resolved above but never passed to
        # requests.get, so the default header set was unused.
        response = requests.get(url=url, headers=headers, cookies=cookie,
                                timeout=10, proxies=proxy)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        html = response.text
        if isinstance(html, bytes):
            html = html.decode("utf-8")
        # Be polite to the target site.
        time.sleep(1)
        return etree.HTML(html)
    except Exception as e:
        log.error("{0}".format(e))
        # Bare raise preserves the original traceback.
        raise
def WriteHTML(self, testcaseinfo):
    """Append one test-case result row to the HTML report file.

    Failed rows get a red result cell. The report file is created first
    (if needed) via self.CreateHtmlFile().
    """
    self.CreateHtmlFile()
    # FIX: file handles are now closed via context managers.
    with open(self.reportfile, "r") as f:
        htmlcontent = f.read()
    tree = html.fromstring(htmlcontent)
    tableElem = tree.find(".//table")
    row_fields = (testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner,
                  testcaseinfo.result, testcaseinfo.starttime, testcaseinfo.endtime,
                  testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
    if testcaseinfo.result == "Failed":
        mytablerow = ("<tr><td>{0}</td><td>{1}</td><td>{2}</td>"
                      "<td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td>"
                      "<td>{6}</td><td>{7}</td></tr>").format(*row_fields)
    else:
        # BUG FIX: this branch previously formatted 8 placeholders with only
        # two arguments (id, errorinfo), raising IndexError on every
        # non-failed test case.
        mytablerow = ("<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td>"
                      "<td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>").format(*row_fields)
    tableElem.append(mytree.HTML(str(mytablerow)))
    newContent = repr(html.tostring(tree, method="html", with_tail=False))
    newContent = newContent.replace(r"\n", "").replace(r"\t", "").replace('b\'', "")
    newContent = newContent[:len(newContent) - 1]
    with open(self.reportfile, "w") as f:
        f.write(newContent)