Python html 模块，parser 实例源码
我们从 Python 开源项目中提取了以下 16 个代码示例，用于说明如何使用 html.parser 模块。
def unescape_html(html_):
    """
    Replace HTML entities (e.g. ``&pound;``) in a string.

    The implementation is chosen by feature detection instead of comparing
    ``sys.version_info`` fields, so interpreters that a version check would
    misroute (for example 3.0-3.2, which have no ``html.unescape``) still
    end up on a working code path.

    :param html_: The escaped HTML.
    :return: The input string with entities replaced.
    """
    # http://stackoverflow.com/a/2360639
    try:
        # 3.4+
        # noinspection pycompatibility
        from html import unescape
    except ImportError:
        # ImportError covers both "no html package" (2.7) and "html package
        # without an unescape function" (3.0-3.3).
        try:
            # 3.0-3.3
            # noinspection pycompatibility
            from html.parser import HTMLParser
        except ImportError:
            # 2.7
            # noinspection PyUnresolvedReferences,pycompatibility
            from HTMLParser import HTMLParser
        # noinspection PyDeprecation
        return HTMLParser().unescape(html_)
    return unescape(html_)
def processIncomingTweet(tweet):
    """Check a tweet from the filter stream and queue it if it is a word request.

    Nothing happens unless ``scanTags(tweet, "NixieBotShowMe")`` is true.
    The tweet is queued through ``wordqPut`` when ``extractWord`` finds a
    word in the unescaped text or ``hasCommand(tweet)`` is true. After
    queuing, the ``maxWordQ`` high-water mark is updated and the tweet is
    appended to ``recentReqs``; once that list is longer than
    ``reqPickleFrequency`` it is handed to ``pickleMe`` and emptied if
    ``pickleMe`` reports success.

    :param tweet: tweet mapping; ``tweet['text']`` and
        ``tweet['user']['screen_name']`` are read here.
    """
    # NOTE(review): block nesting was reconstructed from a copy that had lost
    # its indentation - confirm against the upstream file.
    # print(tweet)
    global maxWordQ
    global wordq
    if scanTags(tweet, "NixieBotShowMe"):
        # html.unescape() replaces HTMLParser().unescape(), which was removed
        # in Python 3.9.
        theWord = extractWord(html.unescape(tweet['text']))
        if theWord is not None or hasCommand(tweet):
            wordqPut(tweet, priority=prioritise(tweet))
            size = wordq.qsize()
            if size > maxWordQ:
                maxWordQ = size
            print("word request from", tweet['user']['screen_name'], "word = ", theWord, " Word queue at:", size, "maxqueue was ", maxWordQ)
            recentReqs.append(tweet)  # store for sending to hard storage every now and then
            if len(recentReqs) > reqPickleFrequency:
                if pickleMe(recentReqs, "Requests", dateStamp=True):
                    # Clear in place so other references to the list see it emptied.
                    recentReqs[:] = []
            #userCounter.update(tweet['user']['screen_name'])
    # DMreceipt bad idea as it still counts against rate limit
    #for ht in tweet['entities']['hashtags']:
    #    if ht['text']=="NBreceipt" and not rct:
    #        sendReceipt(tweet,theWord,tt)
    #        rct=True
def html2tele(html):
    """Convert an HTML string to text with ``_HTMLToText`` and tidy the result.

    After parsing, runs of blank lines are collapsed to a single blank line
    and spaces directly before ``<pre>`` or directly after ``</pre>`` are
    removed.

    :param html: HTML markup to convert.
    :return: the cleaned-up text produced by the parser.
    """
    #print("html2tele input: ",html)
    parser = _HTMLToText()
    # The parser entry point is lower-case ``feed``; ``Feed`` is not an
    # HTMLParser method.
    # NOTE(review): assumes _HTMLToText derives from html.parser.HTMLParser - confirm.
    parser.feed(html)
    parser.close()
    result = parser.get_text()
    result = re.sub(r'\n(\s*\n+)', '\n\n', result)
    result = re.sub(r' +<pre>', '<pre>', result)
    result = re.sub(r'</pre> +', '</pre>', result)
    #print("html2tele result: ",result)
    return result
#----------
def on_success(self, tweet):
    """Handle one stream message, ignoring retweets and already-seen tweet ids.

    Messages without a ``'text'`` key or with a ``'retweeted_status'`` key
    are skipped. Otherwise the unescaped text is printed and, if the id is
    not yet in ``recentIDDeque``, the tweet is passed to
    ``processIncomingTweet`` and its id is pushed onto the left of the deque.

    :param tweet: decoded stream message (mapping).
    """
    global recentIDDeque
    if 'text' in tweet and 'retweeted_status' not in tweet:
        # html.unescape() replaces HTMLParser().unescape(), which was removed
        # in Python 3.9.
        print("<<<<<<<<<<<<<<<<<<< Incoming!<<<<<<<<<<<<<<<<<< " + html.unescape(tweet['text']) + tweet['id_str'])
        if tweet['id_str'] not in recentIDDeque:
            processIncomingTweet(tweet)
            recentIDDeque.appendleft(tweet['id_str'])
        else:
            print("!!!! duplicate! Ignored ")
    # NOTE(review): there is no ``global backOffTime`` here, so this binds a
    # local name only. It looks like it was meant to reset a module-level
    # back-off value - confirm before changing.
    backOffTime = 60
def processIncomingTweet(tweet): #check tweet that has come in via the filter stream,it might have commands in it
# Routes a filter-stream tweet: word requests update the botState['maxWordQ'] high-water
# mark and are logged in recentReqs, "NixieBotRollMe" tweets go onto rollq, and anything
# else is forwarded to randstream.on_success().
# NOTE(review): this copy has lost its indentation and some text; the block nesting has
# to be restored from the upstream source before it can run.
# print(tweet)
global botState
global wordq
global randstream
# NOTE(review): the next line appears truncated - it has no trailing colon and passes
# `priority` to scanTags with no tag name. It looks like a tag check and a later queueing
# call were fused into one line; verify against the upstream source.
if scanTags(tweet,priority = prioritise(tweet))
size = wordq.qsize()
# Track the largest word-queue size seen so far.
if size > botState['maxWordQ'] : botState['maxWordQ'] = size
print("word request from", botState['maxWordQ'])
recentReqs.append(tweet) # store for sending to hard storage every now and then
# Once more than reqPickleFrequency requests are buffered, hand them to pickleMe and,
# if it returns a truthy value, empty the list in place.
if len(recentReqs) > reqPickleFrequency :
if pickleMe(recentReqs, dateStamp=True) :
recentReqs[:]=[]
#userCounter.update(tweet['user']['screen_name'])
elif scanTags(tweet,"NixieBotRollMe") :
rollq.put(tweet)
print("roll request incoming! Word queue at:", rollq.qsize())
else :
#must be a trump tweet so submit to random for now
randstream.on_success(tweet)
# DMreceipt bad idea as it still counts against rate limit
#for ht in tweet['entities']['hashtags']:
# if ht['text']=="NBreceipt" and not rct:
# sendReceipt(tweet,tt)
# rct=True
def on_success(self, tweet):
    """Handle one stream message, ignoring retweets and messages without text.

    Messages that have a ``'text'`` key and no ``'retweeted_status'`` key
    are printed (with HTML entities unescaped) and passed to
    ``processIncomingTweet``.

    :param tweet: decoded stream message (mapping).
    """
    if 'text' in tweet and 'retweeted_status' not in tweet:
        # html.unescape() replaces HTMLParser().unescape(), which was removed
        # in Python 3.9.
        print("<<<<<<<<<<<<<<<<<<< Incoming!<<<<<<<<<<<<<<<<<< " + html.unescape(tweet['text']))
        processIncomingTweet(tweet)
    # NOTE(review): there is no ``global backOffTime`` here, so this binds a
    # local name only. It looks like it was meant to reset a module-level
    # back-off value - confirm before changing.
    backOffTime = 60
def test_html_import(self):
    """Smoke test: the ``html`` package and its two submodules must import."""
    # Submodules first; importing either one also imports the parent package.
    import html.parser
    import html.entities
    import html
    # Getting this far means none of the imports raised.
    self.assertTrue(True)
def parse_args():
    """Parse the command line and make sure a ToC maker tool is usable.

    When ``--toc-maker`` is not given, ``./gh-md-toc`` is used; if that file
    does not exist, the release archive matching ``uname -s`` is downloaded
    with ``wget`` (unless already present), unpacked with ``tar`` and then
    deleted. The owner-execute bit is then added to the tool's permissions.

    The first argument not recognised by the parser is taken as the path of
    the file to work with.

    :return: ``(known_args, filepath)`` - the parsed namespace and the file path.
    :raises EnvironmentError: if downloading or unpacking the tool fails.
    :raises SystemExit: with status 1 when no file path was supplied.
    """
    # NOTE(review): block nesting was reconstructed from a copy that had lost
    # its indentation - confirm the position of the chmod step upstream.
    parser = argparse.ArgumentParser()
    parser.add_argument("--toc-maker", help="path to ToC making tool")
    parser.add_argument("--twitter-poster", default="t update", help="twitter poster command")
    parser.add_argument("-t", "--use-twitter", action="store_true")
    # parse_known_args() keeps unrecognised arguments instead of rejecting
    # them; the first of those is the file to process.
    known_args, unknown_args = parser.parse_known_args()

    if not known_args.toc_maker:
        known_args.toc_maker = "./gh-md-toc"
        if not os.path.isfile(known_args.toc_maker):
            system = cmd.getoutput("uname -s").lower()
            archive = "gh-md-toc.%s.amd64.tgz" % system
            url = "https://github.com/ekalinin/github-markdown-toc.go/releases/download/0.6.0/%s" % archive
            # Reuse an archive left over from an earlier run.
            if not os.path.isfile(archive):
                if cmd.getstatusoutput("wget %s" % url)[0] != 0:
                    raise EnvironmentError("Cannot download toc maker from URL: %s" % url)
            if cmd.getstatusoutput("tar xzf %s" % archive)[0] != 0:
                raise EnvironmentError("Cannot untar toc maker from file %s" % archive)
            os.remove(archive)

    # OR the execute bit in. AND-ing with S_IXUSR would throw away every
    # other permission bit and could never add the execute bit.
    current_permissions = stat.S_IMODE(os.lstat(known_args.toc_maker).st_mode)
    os.chmod(known_args.toc_maker, current_permissions | stat.S_IXUSR)

    if not unknown_args:
        print("You should specify the path for file to work with!")
        # quit() is only injected by the site module; raising SystemExit
        # gives the same exit status without depending on it.
        raise SystemExit(1)
    return known_args, unknown_args[0]
def test_with_deleted_parent(self):
    """Reloading a submodule must fail once its parent package is gone from sys.modules."""
    # see #18681
    from html import parser as submodule
    parent_pkg = sys.modules.pop('html')
    # Put the parent package back when the test finishes, pass or fail.
    self.addCleanup(sys.modules.__setitem__, 'html', parent_pkg)
    with self.assertRaisesRegex(ImportError, 'html'):
        imp.reload(submodule)
def parse(html):
    '''Parse the HTML text ``html`` and return the root of the
    resulting tree.'''
    parser = _MyHTMLParser()
    # The parser entry point is lower-case ``feed``; ``Feed`` is not an
    # HTMLParser method.
    # NOTE(review): assumes _MyHTMLParser derives from html.parser.HTMLParser - confirm.
    parser.feed(html)
    return parser.root
def test_with_deleted_parent(self):
    """imp.reload() of ``html.parser`` raises ImportError when ``html`` is not in sys.modules."""
    # see #18681
    from html import parser as child

    removed = sys.modules.pop('html')

    def restore_parent():
        # Undo the pop above so later tests see the package again.
        sys.modules['html'] = removed

    self.addCleanup(restore_parent)
    with self.assertRaisesRegex(ImportError, 'html'):
        imp.reload(child)
def test_with_deleted_parent(self):
    """A submodule cannot be reloaded while its parent package is missing from sys.modules."""
    # see #18681
    from html import parser as html_parser
    saved_package = sys.modules.pop('html')
    # Registered immediately after the pop so the package is always restored.
    self.addCleanup(sys.modules.__setitem__, 'html', saved_package)
    expected_failure = self.assertRaisesRegex(ImportError, 'html')
    with expected_failure:
        imp.reload(html_parser)
def loadUserFont(fontfile) :
# Reads a glyph table from `fontfile`, writes one command per glyph to `com`, and saves
# the resulting character set to uCharSet.txt.
# NOTE(review): this copy has lost its indentation; the block nesting (in particular
# where the `with comlock` and `for glyph` bodies end) has to be restored from the
# upstream source before it can run.
#load in font file generated from online font designer at http://b7971.lucsmall.com/
#lines should look like: 0x7622,// 0 - A
#and the bit order should be reversed using the button at the top of that page
global comlock
global userProperChars
font = {}
# Remember the current effect settings; they are passed back to setEffex() further down.
stashfx = effx
stashspeed = fxspeed
setEffex(0,0)
userProperChars = ""
print("loading font")
with open(fontfile) as ff :
for line in ff :
if line == '\n' : continue # cope with blank at end of file
# First comma-separated field is the hex bit pattern; the glyph character is the
# text after the first "-" in the second field, with surrounding whitespace removed.
parts = line.split(",")
print("parts = ",parts)
bits = parts[0]
letter = parts[1].split("-")[1].strip()
bitval = int(bits,16)
print(bitval,letter)
font[letter] = bitval
font['-'] = 0x0022 #nasty hack as hyphen entry is broken by the split("-")
font[','] = 0x0004 # ditto for comma
font['~'] = 0x1310 # and tilde
print(len(font)," characters loaded,Now sending")
with comlock :
print("loadfont got comlock")
# NOTE(review): "$B7F", "$B7W" and "$B7M" look like display-controller commands and
# `com` is presumably a serial port - confirm against the hardware documentation.
cmd = "$B7F" + "U" * tubes
print(cmd)
com.write(bytes(cmd+"\r","utf-8"))
for glyph in font:
userProperChars = userProperChars + glyph
cmd="$B7W"+glyph
# Append the glyph's bit pattern as 15 '0'/'1' characters, starting at bit 14 and
# shifting the mask right until it reaches zero.
mask =int('0b0100000000000000',2)
while mask > 0 :
if int(font[glyph]) & int(mask) > 0 :
cmd = cmd + "1"
else :
cmd = cmd + "0"
mask = mask >> 1
print(cmd)
com.write(bytes(cmd+"\r","utf-8"))
# Pause 0.3 s after the write above.
time.sleep(0.3)
cmd="$B7M"+ glyph * tubes
print(cmd)
com.write(bytes(cmd+"\r","utf-8"))
# special case (ok,bodge!) for space as the strip command in the font file parser above will remove it,and all fonts need a space
cmd="$B7W 000000000000000"
print(cmd)
com.write(bytes(cmd+"\r","utf-8"))
cmd="$B7M "
print(cmd)
com.write(bytes(cmd+"\r","utf-8"))
userProperChars = userProperChars + " "
# Hand the stashed effect settings back to setEffex().
setEffex(stashfx,stashspeed)
# now write out character set file ( used by proper() )
with open("uCharSet.txt",'w' ) as cf :
cf.write(userProperChars)
print("loadfont rel comlock")
def test_future_moves(self):
"""
Ensure everything is available from the future.moves interface that we
claim and expect. (Issue #104).
"""
# The test has no assertions: it passes when every import below succeeds.
# NOTE(review): this copy has lost its indentation; nesting of the try/except/else
# bodies has to be restored from the upstream source before it can run.
from future.moves.collections import Counter, OrderedDict # backported to Py2.6
from future.moves.collections import UserDict, UserList, UserString
from future.moves import configparser
from future.moves import copyreg
from future.moves.itertools import filterfalse, zip_longest
# html package and its submodules
from future.moves import html
import future.moves.html.entities
import future.moves.html.parser
# http package and its submodules
from future.moves import http
import future.moves.http.client
import future.moves.http.cookies
import future.moves.http.cookiejar
import future.moves.http.server
from future.moves import queue
from future.moves import socketserver
from future.moves.subprocess import check_output # even on Py2.6
from future.moves.subprocess import getoutput, getstatusoutput
from future.moves.sys import intern
# urllib package and its submodules
from future.moves import urllib
import future.moves.urllib.error
import future.moves.urllib.parse
import future.moves.urllib.request
import future.moves.urllib.response
import future.moves.urllib.robotparser
# winreg is only required when the interpreter itself provides _winreg.
try:
# Is _winreg available on Py2? If so, ensure future.moves._winreg is available too:
import _winreg
except ImportError:
pass
else:
from future.moves import winreg
# xmlrpc package and its submodules
from future.moves import xmlrpc
import future.moves.xmlrpc.client
import future.moves.xmlrpc.server
# underscore-prefixed modules
from future.moves import _dummy_thread
from future.moves import _markupbase
from future.moves import _thread