Python lxml.etree 模块,HTML 实例源码
我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用lxml.etree.HTML。
def QA_fetch_get_stock_block():
    """Crawl stock-block listings from q.10jqka.com.cn.

    Returns a pandas DataFrame with columns [blockname, code, type, source],
    indexed by stock code (index kept as a column too).
    """
    # Block category pages on 10jqka (concept / region / industry variants —
    # original comment was garbled; TODO confirm exact meanings).
    url_list = ['gn', 'dy', 'thshy', 'zjhhy']
    data = []
    for item in url_list:
        tree = etree.HTML(requests.get(
            'http://q.10jqka.com.cn/{}/'.format(item), headers=headers).text)
        gn = tree.xpath('/html/body/div/div/div/div/div/a/text()')
        gpath = tree.xpath('/html/body/div/div/div/div/div/a/@href')
        for _i in range(len(gn)):
            for page in range(1, 15):
                _data = etree.HTML(requests.get(
                    'http://q.10jqka.com.cn/{}/detail/order/desc/page/{}/ajax/1/code/{}'.format(
                        item, page, gpath[_i].split('/')[-2]), headers=headers).text)
                name = _data.xpath('/html/body/table/tbody/tr/td[3]/a/text()')
                code = _data.xpath('/html/body/table/tbody/tr/td[3]/a/@href')
                for i_ in range(len(name)):
                    stock_code = code[i_].split('/')[-1]
                    print('Now Crawling-{}-{}-{}-{}'.format(gn[_i], stock_code, item, 'ths'))
                    # BUG FIX: previously appended only [blockname, source],
                    # but the DataFrame below expects four columns.
                    data.append([gn[_i], stock_code, item, 'ths'])
    return pd.DataFrame(
        data, columns=['blockname', 'code', 'type', 'source']).set_index('code', drop=False)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Each of *num_elements* iterations randomly appends one of: an opening
    tag, a short random sentence (via the module-level ``rsentence`` helper),
    a closing tag, or nothing — so tags generally do not balance. The
    fragments are joined with newlines inside <html>...</html>.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0,3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1,4)))
        elif choice == 2:
            # Close a tag (may not match any open tag).
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing this iteration.
    return "<html>" + "\n".join(elements) + "</html>"
def _get_html(cls, html, url, html_etree, params, **kwargs):
    """Resolve one of (html, url, html_etree) into a parsed lxml tree.

    Priority: a raw HTML string, then a URL to fetch (encoding detected
    with cchardet), then an already-parsed tree. Raises ValueError when
    none of the three was supplied.
    """
    if html:
        return etree.HTML(html)
    if url:
        if not kwargs.get('headers'):
            # Supply a UA when the caller gave none (or an empty one).
            kwargs['headers'] = {
                "User-Agent": get_random_user_agent()
            }
        response = requests.get(url, **kwargs)
        response.raise_for_status()
        raw = response.content
        encoding = cchardet.detect(raw)['encoding']
        return etree.HTML(raw.decode(encoding))
    if html_etree is not None:
        return html_etree
    raise ValueError("html(url or html_etree) is expected")
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def parse_page(page, pattern):
    """Parse a proxy-list HTML page and yield proxy records.

    pattern: dict of XPath strings with keys "ip", "port", "type".
    Yields dicts {"ip_port": "ip:port", "type": 0|1, "db_flag": False},
    where type is 1 when the type cell mentions "https".
    """
    tree = etree.HTML(page.lower())
    ips = tree.xpath(pattern["ip"])
    ports = tree.xpath(pattern["port"])
    types = tree.xpath(pattern["type"])
    # zip truncates to the shortest list, avoiding the IndexError the old
    # index-based loop could hit on ragged results.
    for ip_node, port_node, type_node in zip(ips, ports, types):
        ret = {}
        # IDIOM FIX: no longer shadows the builtin `str`.
        ret["ip_port"] = "%s:%s" % (ip_node.text, port_node.text)
        if type_node.text.find("https") == -1:
            ret["type"] = 0
        else:
            ret["type"] = 1
        ret["db_flag"] = False
        yield ret
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def get_list(self, search_url):
    """Fetch the hot-news keyword sidebar from *search_url*, persist the
    top entries via self.save(), and return the raw lists.

    Returns {'keyurl': [...], 'keyword': [...]}.
    """
    html = requests.get(search_url, headers=self.headers, verify=False).content
    selector = etree.HTML(html)
    # Hot-news sidebar links and titles.
    keyurl = selector.xpath('//div[@class="aside"]/ol[@class="hot-news"]/li/a/@href')
    keyword = selector.xpath('//div[@class="aside"]/ol[@class="hot-news"]/li/a/text()')
    res = {'keyurl': keyurl, 'keyword': keyword}
    # BUG FIX: was a hard-coded range(0, 10), which raised IndexError
    # whenever fewer than 10 entries were scraped.
    for x in range(min(10, len(keyword), len(keyurl))):
        data = {
            'table_name': 'dailyKeyword',
            'keyword': keyword[x],
            'keyurl': keyurl[x],
            'id': x + 1,
        }
        self.save(data)
    return res
# (original Chinese comment lost to mojibake in this extraction)
def __init__(self, data=None, response=None, url=None, logFile=None, color=True, debug=4):
    """Build the parser.

    data:     raw page text (str/unicode), typically response.text
    response: a Response object; its request URL is used when url is None
    url:      explicit URL override
    logFile:  optional path for log output
    color:    colourise log output
    debug:    verbosity 0-4 (0 silent, 1 errors ... 4 everything)
    """
    self.logFile = logFile
    self.color = color
    self.debug = debug
    self.data = data
    self.response = response
    try:
        if response and not url:
            self.url = response.request.url
        else:
            self.url = url
        # Pre-parse the document when raw text was supplied.
        self._html = etree.HTML(data) if data else None
    except Exception as e:
        printText("[Error]parser.py Parser __init__:%s" % e,
                  logFile=self.logFile, color=self.color, debug=self.debug)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def get_type_id():
    """Collect the distinct model ids (data-value attributes) from the
    autohome category index pages and return them as a de-duplicated list.
    """
    # Category pages; labels assumed from the URL slugs — TODO confirm.
    start_url_list = [
        'http://www.autohome.com.cn/a00/',
        'http://www.autohome.com.cn/a0/',
        'http://www.autohome.com.cn/a/',
        'http://www.autohome.com.cn/b/',
        'http://www.autohome.com.cn/c/',
        'http://www.autohome.com.cn/d/',
        'http://www.autohome.com.cn/suv/',   # SUV
        'http://www.autohome.com.cn/mpv/',   # MPV
        'http://www.autohome.com.cn/s/',
        'http://www.autohome.com.cn/p/',
        'http://www.autohome.com.cn/mb/',
    ]
    model_ids = set()
    for page_url in start_url_list:
        page = etree.HTML(process_request(page_url))
        model_ids.update(page.xpath('.//a/@data-value'))
    return list(model_ids)
def set_nasa_wallpaper():
    """Download today's NASA APOD image and set it as the background."""
    stamp = datetime.fromtimestamp(time.time()).strftime('%y%m%d')
    r = requests.get(URL07.format(stamp))
    if r.status_code != 200:
        return
    try:
        # recover=True lets lxml tolerate the page's loose HTML.
        html = etree.HTML(r.content, etree.HTMLParser(recover=True))
        images = list(html.iter('img'))
        if images:
            # The first <img> is wrapped in an <a href> pointing at the
            # full-resolution picture.
            image_url = 'https://apod.nasa.gov/' + images[0].getparent().attrib['href']
            if download(image_url) is True:
                set_background(comun.POTD)
    except Exception as e:
        print(e)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def show_body():
    """Load a saved crawl body from cq_error.txt and print each listing.

    Debug helper: prints the detail URL, name and description fragments of
    every li.pictext node in the saved HTML.
    """
    with open('cq_error.txt', 'r') as fp:
        content = json.loads(fp.read())['body']
    tree = etree.HTML(content)
    nodes = tree.xpath('//li[@class="pictext"]')
    for node in nodes:
        xiaoqu_url = node.xpath('.//a[@class="flexBox post_ulog"]/@href')[0]
        name = node.xpath('.//div[@class="item_list"]/div[@class="item_main"]/text()')[0]
        desc = node.xpath('.//div[@class="item_list"]/div[@class="item_other text_cut"]/text()')[0]
        details = desc.split()
        # MODERNIZED: Python 2 print statements converted to print().
        print(xiaoqu_url)
        print(name)
        print(len(details))
        for detail in details:
            print(detail)
        print('')
def get_city_link():
    """Scrape the lianjia mobile city index and return full city URLs,
    skipping the /sh/, /su/ and /xsbn/ entries."""
    headers = {'Host': 'm.lianjia.com',
               'User-Agent': 'UCWEB/2.0 (Linux; U; Adr 2.3; zh-CN; MI-ONEPlus) U2/1.0.0 UCbrowser/8.6.0.199 U2/1.0.0 Mobile'}
    r = requests.get(url='https://m.lianjia.com/city/', headers=headers)
    tree = etree.HTML(r.text)
    # The second item_lists block holds the city links.
    city_block = tree.xpath('//ul[@class="item_lists"]')[1]
    skipped = ('/sh/', '/su/', '/xsbn/')
    city_list = []
    for city in city_block:
        link = city.xpath('.//a/@href')[0]
        if link in skipped:
            continue
        city_list.append('https://m.lianjia.com' + link)
    return city_list
def debug_page():
    """Fetch one qfang rent page for interactive debugging.

    Prints status/raw content and returns (parsed tree, raw text).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'
    }
    url = 'http://m.qfang.com/guangzhou/rent/100001468?gardenId=1109818'
    r = requests.get(url=url, headers=headers)
    # MODERNIZED: Python 2 print statements converted to print().
    print(r.status_code)
    print(type(r.content))
    print(r.content)
    tree = etree.HTML(r.text, parser=etree.HTMLParser(encoding='utf-8'))
    return tree, r.text
# (original Chinese comment lost to mojibake; presumably about the request headers used above)
def testcase2():
    """Parse a saved lianjia JSON dump and print its node count plus the
    'total' field extracted from the args payload."""
    # FIX: file handle was previously left unclosed.
    with open('lianjia_sh.txt') as fp:
        js = json.loads(fp.read())
    tree = etree.HTML(js['data'])
    nodes = tree.xpath('//li[@class="pictext"]')
    # MODERNIZED: Python 2 print statements converted to print().
    print("NODE:", len(nodes))
    print(js['args'])
    print('*' * 20)
    print(type(js))
    print(type(js['args']))
    # Extract the numeric "total" field from the args string.
    p = re.compile(r'"total":(\d+)')
    s = p.findall(js['args'])[0]
    print(s)
def lxml_case2():
    """Demo: parse an XML snippet with etree.HTML and print the result of a
    relative 'bookstore' XPath lookup."""
    str1 = '''
<bookstore>
<book>
<title>Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
</bookstore>
'''
    tree = etree.HTML(str1)
    t1 = tree.xpath('bookstore')
    # MODERNIZED: Python 2 print statement converted to print().
    print(t1)
def getData(self):
    """Fetch the qq appstore category menu and return the categoryId
    values found in its links (as strings)."""
    parent_url = 'http://sj.qq.com/myapp/category.htm?orgame=1'
    s = requests.get(url=parent_url, headers=self.headers)
    # MODERNIZED: Python 2 print statements converted to print().
    print(s.status_code)
    tree = etree.HTML(s.text)
    menu = tree.xpath('//ul[@class="menu-junior"]')[0]
    print(type(menu))
    link = menu.xpath('.//li[@id]/a/@href')
    catelog = []
    # PERF: regex compiled once, outside the loop; ids may be negative.
    p = re.compile(r'categoryId=(-?\d+)')
    for i in link:
        print(i)
        catelog.append(p.findall(i)[0])
    return catelog
def get_list(self, cookies):
    """Fetch listing page self.page and return [[title, link], ...] pairs.

    Returns 10001 when the body contains 'window.v=' (presumably an
    anti-crawler interstitial — TODO confirm). On a non-200 response it
    waits 5 seconds and retries.
    """
    print("?????%s???\r\n" % self.page)
    page_r = requests.get(self.targetUrl + "&page=%s" % self.page, cookies=cookies)
    if page_r.status_code == 200:
        if 'window.v=' in page_r.text:
            return 10001
        tree = etree.HTML(page_r.text)
        init_list = tree.xpath('//*[@id="ht-kb"]/article/h3/a')
        list_array = []
        for item in init_list:
            item_link = item.get('href')
            item_text = item.text
            list_array.append([item_text, item_link])
        return list_array
    else:
        print("???????5??????\r\n")
        time.sleep(5)
        # BUG FIX: the retry previously called self.get_list() without the
        # required cookies argument, raising TypeError.
        return self.get_list(cookies)
def get_proxys(pages=4):
    """Scrape xicidaili proxy pages 1..pages and return formatted proxies.

    Returns a list of strings built with constants.HTTP_PROXY_FORMATTER.
    """
    proxy_list = []
    # BUG FIX: the URL previously had no {page_no} placeholder, so
    # url.format(page_no=...) was a no-op and every iteration fetched the
    # same first page.
    url = 'http://www.xicidaili.com/wn/{page_no}'
    headers = generate_http_header()
    headers.update(
        {
            'Referer': 'http://www.xicidaili.com/wn/',
            'Host': 'www.xicidaili.com',
        }
    )
    for page_no in range(1, pages + 1):
        response = requests.get(url=url.format(page_no=page_no), headers=headers)
        html = etree.HTML(response.text)
        ips = html.xpath("//table[@id='ip_list']/tr/td[2]/text()")
        ports = html.xpath("//table[@id='ip_list']/tr/td[3]/text()")
        assert len(ips) == len(ports)
        for (ip, port) in zip(ips, ports):
            proxy_list.append(constants.HTTP_PROXY_FORMATTER.format(ip=ip, port=port))
    return proxy_list
def requests_company_detail_data(company_id):
    """Fetch a company detail page and return its formatted tag data.

    Raises RequestsError (after logging) when the HTTP request fails.
    """
    headers = generate_http_header()
    crawler_sleep()
    try:
        response = requests.get(
            url=constants.COMPANY_DETAIL_URL.format(company_id=company_id),
            headers=headers,
            cookies=Cookies.get_random_cookies(),
            allow_redirects=False,
            timeout=constants.TIMEOUT)
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    page = etree.HTML(response.text)
    # Pull the four sections used by format_tag.
    advantage = page.xpath('//div[@id="tags_container"]//li/text()')
    size = page.xpath('//div[@id="basic_container"]//li[3]/span/text()')
    address = page.xpath('//p[@class="mlist_li_desc"]/text()')
    introduce = page.xpath('//span[@class="company_content"]//text()')
    return format_tag(advantage, address, size, introduce, company_id)
def requests_job_detail_data(job_id):
    """Fetch a job detail page and return its formatted fields.

    Raises RequestsError (after logging) when the HTTP request fails.
    """
    headers = generate_http_header()
    crawler_sleep()
    try:
        # BUG FIX: `headers` was built but never passed to requests.get,
        # so the generated header set was unused.
        response = requests.get(
            url=constants.JOB_DETAIL_URL.format(job_id=job_id),
            headers=headers,
            timeout=constants.TIMEOUT)
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    html = etree.HTML(response.text)
    department = html.xpath('//div[@class="job-name"]/div[@class="company"]/text()')
    description = html.xpath('//dd[@class="job_bt"]/div//text()')
    keywords = html.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()')
    return format_tag(department, description, keywords, job_id)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def index(url='http://music.163.com/discover'):
    """Crawl the netease discover page and enqueue one playlist task per
    /playlist link found."""
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/58.0.3029.110 Safari/537.36 DOL/s_1511_r2x9ak474125_821',
    }
    try:
        r = requests.get(url, headers=headers, timeout=4)
        html = etree.HTML(r.content)
        play_lists = [urlparse.urljoin('http://music.163.com/', link) for link in
                      html.xpath('//*[@id="discover-module"]/div[1]/div/div/div[1]/ul//li/div/a/@href') if
                      link.startswith('/playlist')]
        for url in play_lists:
            app.send_task(
                'tasks.playlist.playlist',
                args=(url, ),
                queue='playlist_queue',
                routing_key='tasks_playlist'
            )
    except Exception:
        # BUG FIX: was a bare except that also swallowed SystemExit and
        # KeyboardInterrupt; py2 print converted to print().
        print('????')
def playlist(url):
    """Fetch a playlist page and enqueue one comment task per song link.

    NOTE(review): reconstructed — the headers dict and request call were
    truncated in this dump; header values mirror the sibling index() task.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
    }
    try:
        r = requests.get(url, headers=headers, timeout=4)
        if r.status_code == 200:
            html = etree.HTML(r.content)
            ids = [search(link).group() for link in html.xpath('//a/@href')
                   if link.startswith('/song?id') and search(link)]
            for song_id in ids:
                url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_{}?csrf_token='.format(song_id)
                app.send_task(
                    'tasks.comment.comment',
                    args=(url, song_id),
                    queue='comment_queue',
                    routing_key='tasks_comment'
                )
                time.sleep(5)
    except Exception:
        print('????')
def parse(self, response):
    """Extract weibo user ids from a page, persist their info, and return
    the fan-page URLs to crawl next (or None for an empty response)."""
    if not response:
        return None
    et = etree.HTML(response)
    links = et.xpath("//*[@valign='top'][1]/a/@href")
    urls = []
    for link in links:
        # MODERNIZED: Python 2 print statements converted to print().
        print(link)
        # Only numeric /u/<uid> profile links carry an explicit uid;
        # vanity URLs (e.g. /renzhenghao) are skipped.
        uid = re.findall(r"http://weibo\.cn/u/(\w*)", link)
        if uid:
            uid = uid[0]
        else:
            continue
        SinaWeiboItem["uid"] = uid
        info_url = "http://weibo.cn/{uid}/info".format(uid=uid)
        Request(info_url, callback=self.parse_info)
        datas = {"uid": SinaWeiboItem["uid"], "name": SinaWeiboItem["name"], "info": SinaWeiboItem["info"]}
        print(sina_info.insert(datas))
        # Queue this user's fans page for the next crawl round.
        urls.append("http://weibo.cn/{uid}/fans".format(uid=uid))
    return urls
def media_by_tag(browser, tag_url, media_url, tag, media_max_likes, media_min_likes):
    """Return the posts for an Instagram tag page, filtered by like count
    and comments-enabled, as {'posts': [...]|False, 'tag': tag}."""
    result = {'posts': False, 'tag': tag}
    try:
        explore_site = browser.get(tag_url % (tag))
        tree = etree.HTML(explore_site.text)
        data = return_sharedData(tree)
        if data:
            nodes = data['entry_data']['TagPage'][0]['tag']['media']['nodes']
            result['posts'] = [{'user_id': n['owner']['id'],
                                'username': return_username(browser, n['code']),
                                'likes': n['likes']['count'],
                                'caption': n['caption'],
                                'media_id': n['id'],
                                'url_code': n['code']}
                               for n in nodes
                               if media_min_likes <= n['likes']['count'] <= media_max_likes
                               if not n['comments_disabled']]
    except Exception as e:
        # MODERNIZED: Python 2 print statement converted to print().
        print('\nError in obtaining media by tag: %s' % (e))
    return result
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def parse_main_subjects(self, content):
    """Extract subject detail-page URLs from a listing page.

    content: raw HTML of the listing page.
    Returns a list of href strings; an empty list on any parse error.
    """
    try:
        tree = etree.HTML(content.lower())
        items = tree.xpath('//ul[@class="img"]/li')
        # First child of each <li> is the anchor whose href we want.
        return [item[0].get('href') for item in items]
    except Exception as e:
        print(str(e))
        return list()
def replace_InvalidTag(Html):
    """Strip CDATA blocks, <br>, ALL whitespace, comments, <style>/<script>
    bodies and every remaining HTML tag from *Html*; returns bare text.

    Note: whitespace removal happens before tag stripping, mirroring the
    original pipeline (so text like 'a b' becomes 'ab').
    """
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # commented-out CDATA
    Html = re_cdata.sub('', Html)
    # BUG FIX: this pattern was previously compiled with `Html` passed as
    # the flags argument (TypeError on every call) and its substitution
    # was never applied.
    re_cdata = re.compile(r'<!\[CDATA\[[^>]*//\]\]>', re.I)
    Html = re_cdata.sub('', Html)
    re_br = re.compile(r'<br\s*?/?>')  # line breaks -> newline
    Html = re_br.sub('\n', Html)
    space_line = re.compile(r'\s+')  # drop all whitespace (incl. the \n above)
    Html = space_line.sub('', Html)
    re_comment = re.compile(r'<!--[^>]*-->')  # HTML comments
    Html = re_comment.sub('', Html)
    re_style = re.compile(r'<style\s*[^>]*>(.*?)</style\s*>')
    Html = re_style.sub('', Html)
    re_script = re.compile(r'<script\s*[^>]*>(.*?)</script>')
    Html = re_script.sub('', Html)
    re_h = re.compile(r'</?[^>]*>')  # any remaining tag
    Html = re_h.sub('', Html)
    return Html
def replace_CharEntity(Html):
    """Replace common HTML character entities (&amp;, &lt;, &#60;, ...) in
    *Html* with their literal characters; unknown entities are removed."""
    CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"', }
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')
    sz = re_charEntity.search(Html)
    while sz:
        key = sz.group('name')  # entity body without '&'/'#'/';' (e.g. 'gt')
        try:
            Html = re_charEntity.sub(CHAR_ENTITIES[key], Html, 1)
        except KeyError:
            # Unknown entity: drop it.
            # BUG FIX: original called sub('', 1) without the target string,
            # raising TypeError whenever an unknown entity was hit.
            Html = re_charEntity.sub('', Html, 1)
        sz = re_charEntity.search(Html)
    return Html
def extract_Meta(html):
    """Extract <meta> content attributes containing Chinese text.

    Returns the list of decoded values, or False when none are found.
    """
    if chardet.detect(html)['encoding'] == 'utf-8':
        html = html.decode('utf-8')
    Meta_list = []
    page = etree.HTML(html.lower())
    # BUG FIX: the page text is lower-cased above and XPath names are
    # case-sensitive, so '//Meta/@content' could never match; use 'meta'.
    xpath_result = page.xpath(u"//meta/@content")
    for once_xpath_result in xpath_result:
        # Keep only values that contain Chinese characters.
        if zh_check(once_xpath_result) == True:
            Meta_list.append(utf8_transfer(once_xpath_result).decode('utf-8'))
    if Meta_list != []:
        return Meta_list
    else:
        return False
def validProxy(self):
    """Scan 66ip.cn's first area page for a working proxy.

    Returns a requests-style proxies dict {"https": "https://ip:port"} for
    the first candidate passing both verification checks, else None.
    """
    url = 'http://www.66ip.cn/areaindex_1/1.html'
    response = requests.get(url=url, headers=self.headers)
    # Page is served in GBK.
    tree = etree.HTML(response.content.decode('gbk'))
    for row in tree.xpath('.//table//tr'):
        # First two cells are ip and port.
        candidate = ':'.join(row.xpath('./td/text()')[0:2])
        if not self.__verifyProxy(candidate):
            continue
        if self.__isVaildProxy(candidate):
            return {
                "https": "https://{proxy}".format(proxy=candidate)
            }
    return None
# (original Chinese comment lost to mojibake; presumably about validating proxy IPs)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def get_xml_data(req_string, headers, data=None):
    """Request *req_string* and return the response parsed as an XML tree,
    falling back to lxml's HTML parser when strict XML parsing fails.

    Raises urllib2.HTTPError(404) when the response is not parseable at
    all (e.g. a 404 page served without a 404 status code).
    """
    req = urllib2.Request(req_string, headers=headers)
    html_data = _get_html_data(req, data)
    # Strip chunked transfer-encoding artifacts before parsing.
    html_data = clean_chunked_data(html_data)
    try:
        parsed = etree.fromstring(html_data)
    except XMLSyntaxError:
        # lxml's XML parser cannot handle in-document encoding
        # declarations; retry with the forgiving HTML parser.
        parsed = etree.HTML(html_data, etree.HTMLParser())
    if parsed is None:
        raise urllib2.HTTPError(req_string, 404, "Not an XML", None, None)
    # Todo: check the document actually contains a .//prestashop node.
    return parsed.getroottree()
def MakePoem(word):
    """Search so.gushiwen.org for *word* and return the matched poem body.

    The text is stripped of newlines/spaces and its trailing character
    (punctuation on the source page) is dropped.
    """
    url = "http://so.gushiwen.org/search.aspx?value=" + word
    res = requests.get(url)
    res.encoding = 'utf-8'
    root = etree.HTML(res.content)
    # Second result block holds the poem body paragraph.
    block = root.xpath('//div[@class="sons"][2]/p[@style="margin-bottom:0px;"]')[0]
    text = block.xpath('string(.)').replace('\n', '').replace(' ', '')
    return text[:-1]
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def parse_home(self, home_content):
    """Parse the fund home page and return the 6-digit fund codes found in
    its link texts (or None when no content was given)."""
    if home_content is None:
        return None
    # Page was fetched as ISO-8859-1 but is actually GBK; re-decode.
    home_content = home_content.encode('ISO-8859-1').decode('gbk')
    html = etree.HTML(home_content, parser=etree.HTMLParser(encoding='utf-8'))
    alinks = html.xpath('//a[@href]')
    # NOTE(review): Python 2 `ur` literal; the characters before/after the
    # capture groups were garbled in this dump — expected shape appears to
    # be <bracket>(\d{6})<bracket>(.+), i.e. code then fund name.
    pattern_capture = re.compile(ur"?(\d{6})?(.+)")
    l = []
    for alink in alinks:
        aa = alink.text
        if aa != None:
            match = pattern_capture.match(aa)
            if match:
                # Keep only the 6-digit code; the name (group 2) is unused.
                l.append(match.group(1))
    return l
# (original Chinese comment garbled; it presumably described the ratio-parsing helper below)
def parse_ratio(self, info, content):
    """Accumulate the institutional holding ratio parsed from *content*
    into info.inratio.

    The first td.tor cell holds the percentage; cells showing '---'
    (no data) are skipped. Pages with two or fewer cells are ignored.
    """
    tree = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    cells = tree.xpath('//td[@class="tor"]')
    if len(cells) <= 2:
        return
    ratio_text = cells[0].text
    if ratio_text != '---':
        info.inratio += safe_to_float(ratio_text.split("%")[0])
def parse_stocks(self, content):
    """Parse the fund holdings table and append 'name-percent' strings to
    info.stocks.

    NOTE(review): `info` is not a parameter here (cf. parse_ratio, which
    takes one) — it must come from enclosing scope; confirm against the
    full file.
    """
    html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    tbs = html.xpath('//table[@class="w782 comm tzxq"]')
    if len(tbs) > 0:
        stocktds = tbs[0].xpath('.//td[@class="tol"]/a')
        pers = tbs[0].xpath('.//td[@class="tor"]')
        # Column layout differs between the two table variants; pick the
        # offset (front) and stride (interval) of the percentage cell.
        front, interval = 2, 5
        if not '???' in content:
            front, interval = 0, 3
        for (index, stocked) in enumerate(stocktds):
            per = pers[index * interval + front]
            # BUG FIX: previously compared the Element object itself to
            # '---' (always False); compare its text instead.
            if per.text == '---':
                continue
            stockname = stocked.text
            if stockname is not None and len(stockname) > 0:
                info.stocks.append(stockname + '-' + per.text)
def parse_index_list(self, index_list_content):
    """Parse the index catalogue page into a list of IndexInfo objects.

    Each 5-column table row whose first cell looks like 'NNNNNN.XX' yields
    one IndexInfo carrying code, full code, name, inception date, short
    name and the weaving-rule link (or plain text).
    """
    # Page was fetched as ISO-8859-1 but is actually UTF-8; re-decode.
    index_list_content = index_list_content.encode('ISO-8859-1').decode('utf-8')
    parsed_content = etree.HTML(index_list_content, parser=etree.HTMLParser(encoding='utf-8'))
    trs = parsed_content.xpath('//tbody/tr')
    indexs = []
    for tr in trs:
        tds = tr.xpath('./td')
        if len(tds) == 5:
            index = IndexInfo()
            code = tds[0].text.strip()
            # Codes look like '000300.SH'; keep the numeric part separately.
            if len(code.split('.')) == 2:
                index.code = code.split('.')[0]
                index.full_code = code
                index.name = tds[1].text.strip()
                index.begin_time = tds[2].text.strip()
                index.short_name = tds[3].text.strip()
                # The weaving rule is either a link or plain cell text.
                weave = tds[4].xpath('./a')
                if len(weave) == 1:
                    index.weave = weave[0].attrib['href'].strip()
                else:
                    index.weave = tds[4].text.strip()
                indexs.append(index)
    return indexs
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def getHtmlTree(url, **kwargs):
    """Fetch *url* (through WebRequest) and return it parsed as an lxml
    HTML tree.

    NOTE(review): the Accept / Accept-Encoding header values were truncated
    in this dump and have been reconstructed with conventional values.
    """
    header = {'Connection': 'keep-alive',
              'Cache-Control': 'max-age=0',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML,like Gecko)',
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Encoding': 'gzip, deflate, sdch',
              'Accept-Language': 'zh-CN,zh;q=0.8',
              }
    wr = WebRequest()
    # Throttle: delay 2s per request.
    time.sleep(2)
    html = wr.get(url=url, header=header).content
    return etree.HTML(html)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    NOTE(review): this copy was truncated in the dump; restored to match
    the intact definition of the same function appearing in this file.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New (possibly never-closed) tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of 1-4 random sentences of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a (possibly unopened) tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3: append nothing.
    return "<html>" + "\n".join(elements) + "</html>"
def get_html_tree(url, headers=None, cookie=None, proxy=None):
    """GET *url* and return the body parsed as an lxml HTML tree.

    headers defaults to the module-level HEADERS; any request/HTTP error
    is logged and re-raised.
    """
    if headers is None:
        headers = HEADERS
    try:
        # BUG FIX: `headers` was resolved above but never passed to
        # requests.get, so the default header set was unused.
        response = requests.get(url=url, headers=headers, cookies=cookie,
                                timeout=10, proxies=proxy)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        html = response.text
        if isinstance(html, bytes):
            html = html.decode("utf-8")
        # Be polite to the target site.
        time.sleep(1)
        return etree.HTML(html)
    except Exception as e:
        log.error("{0}".format(e))
        # Bare raise preserves the original traceback.
        raise
def WriteHTML(self, testcaseinfo):
    """Append one test-case result row to the HTML report file.

    Failed rows get a red result cell. The report file is created first
    (if needed) via self.CreateHtmlFile().
    """
    self.CreateHtmlFile()
    # FIX: file handles are now closed via context managers.
    with open(self.reportfile, "r") as f:
        htmlcontent = f.read()
    tree = html.fromstring(htmlcontent)
    tableElem = tree.find(".//table")
    row_fields = (testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner,
                  testcaseinfo.result, testcaseinfo.starttime, testcaseinfo.endtime,
                  testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
    if testcaseinfo.result == "Failed":
        mytablerow = ("<tr><td>{0}</td><td>{1}</td><td>{2}</td>"
                      "<td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td>"
                      "<td>{6}</td><td>{7}</td></tr>").format(*row_fields)
    else:
        # BUG FIX: this branch previously formatted 8 placeholders with only
        # two arguments (id, errorinfo), raising IndexError on every
        # non-failed test case.
        mytablerow = ("<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td>"
                      "<td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>").format(*row_fields)
    tableElem.append(mytree.HTML(str(mytablerow)))
    newContent = repr(html.tostring(tree, method="html", with_tail=False))
    newContent = newContent.replace(r"\n", "").replace(r"\t", "").replace('b\'', "")
    newContent = newContent[:len(newContent) - 1]
    with open(self.reportfile, "w") as f:
        f.write(newContent)