问题描述
我正在尝试构建程序来解析html页面。在这种情况下,我无法直接获取url,因此我要求用户下载html页面并使用它。
# -*- coding: UTF-8 -*-
from re import findall
from bs4 import BeautifulSoup
# INPUT
def inside(html_path,errors='ignore'):
with open(html_path,errors=errors) as fp:
soup = BeautifulSoup(fp,features='lxml')
return soup
def pairing(html_path,errors='ignore') -> dict:
use_dict = {}
soup = inside(html_path=html_path,errors=errors)
for pair in zip(
soup.find_all('div',{"class": "audio_row__performers"}),soup.find_all('span',{"class": "audio_row__title_inner _audio_row__title_inner"}),{"class": "audio_row__title_inner_subtitle _audio_row__title_inner_subtitle"})
):
"""
pair[0] - musician(-s),pair[1] - track_name,pair[2] - subtitle for track(if any)
"""
track_author = pair[0].find('a').text
pair_2_str = str(pair[2])
regex = "(?<=>).*?(?=<)"
add_Meta = findall(regex,pair_2_str)[0]
track_name = pair[1].text + f" {add_Meta}"
use_dict.update({track_author: track_name})
return use_dict
用errors='replace'
执行后,我得到的是这样的东西:
('The Offspring','Dividing By Zero ')
('ACDC','Hightway to Hell ')
('����(�.�. ���)','������ ����� �� ������ ')
('Haddaway',"What is love,baby don't hurt me. ")
("Guns N' Roses",'Catcher in the rye ')
('Queen','Dont stop me Now (�������� � ������) ')
('The Subways','Rock & Roll Queen ')
('Fetty Wap','Trap Queen ')
我以为我的页面有误,所以我看了一下元数据,不幸的是发现了这一点:
<div class="audio_row__performer_title">
<div onmouSEOver="setTitle(this)" class="audio_row__performers"><a href="https://vk.com/audio?performer=1&q=%D0%9A%D0%B8%D0%BD%D0%BE%28%D0%92.%D0%A0.%20%D0%A6%D0%BE%D0%B9%29">Кино(В.Р. Цой)</a></div>
<div class="audio_row__title _audio_row__title" onmouSEOver="setTitle(this)">
<span class="audio_row__title_inner _audio_row__title_inner">Группа крови на рукаве</span>
<span class="audio_row__title_inner_subtitle _audio_row__title_inner_subtitle"></span>
</div>
</div>
<div class="audio_row__info _audio_row__info"><div class="audio_row__duration audio_row__duration-s _audio_row__duration">3:59</div></div>
</div>
这意味着我有正确的页面,但是bs4中的解码func无法检测到这些符号(使用errors='strict'
执行此消息后,我得到了此消息):
Traceback (most recent call last):
File "/home/roman/VKMusic/ParserEXE.py",line 5,in <module>
use_dict = pairing(html_path=html_path,errors='strict')
File "/home/roman/VKMusic/Main1.py",line 17,in pairing
soup = inside(html_path=html_path,errors=errors)
File "/home/roman/VKMusic/Main1.py",line 10,in inside
soup = BeautifulSoup(fp,features='lxml')
File "/home/roman/anaconda3/lib/python3.8/site-packages/bs4/__init__.py",line 306,in __init__
markup = markup.read()
File "/home/roman/anaconda3/lib/python3.8/codecs.py",line 322,in decode
(result,consumed) = self._buffer_decode(data,self.errors,final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcc in position 457: invalid continuation byte
在这种情况下,HTML页面是使用Google Chrome下载的(我本人在Linux(Ubuntu)上和我的朋友在Windows 7上下载。两者的结果相同),但我也尝试使用Firefox并同样遇到此错误。 br />
我需要我的代码解析整个html,包括西里尔字母。
链接到HTML页面示例:https://drive.google.com/file/d/1FKhTlVErjAKI9L2iedJtmHaXpoyCBMdl/view?usp=sharing
解决方法
在函数“内部”中替换with open(html_path,errors=errors) as fp:
到
with open(html_path,errors=errors,encoding='cp1251') as fp:
输出:
...
'ЧёрныеОзёра':'За6дней', 'ЧёрныеОзёраиМарияНемцева':'Отпусти', 'ШарльГуно':'Опера“Фауст”,Вальпургиеваночь-Античныйтанец[“Маскив' 'Опере-2“]', 'Шопен':'Этюд«Революционый»№12доминор', 'гиперборея':'предтечи', '♫巫师王':'Взглянивокруг,оглянисьназад,духистобоюсвязаться' 'хотят。 Мирнетаковкакимкажетсяон,чудесамикаждый' 'окружен。 Всёвокругподвластноглазам,сделатьсвойвыбор' 'должентысам,встретьсвоюсудьбу-бытьшаманом。 Королём,' 'всехшаманов,королёмеслид'}