Data Analysis:
·Regular Expressions:
Test1 (Regular Expressions - String Splitting):
Code:
import re

# Split a string, using 's' as the delimiter
one = 'asdfsdfas'
pattern = re.compile('s')
result = pattern.split(one)
print(result)
Returns:
['a', 'df', 'dfa', '']
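For comparison, re.split earns its extra step over str.split when there are several delimiters; a minimal sketch (it reuses the sample string above, and the [sd] character class is made up for illustration):

import re

# One character class splits on either 's' or 'd' in a single pass
mixed = 'asdfsdfas'
print(re.split('[sd]', mixed))  # ['a', '', 'f', '', 'fa', '']
# str.split only handles one fixed delimiter at a time
print(mixed.split('s'))         # ['a', 'df', 'dfa', '']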
Test2 (Regular Expressions - Matching Chinese Characters):
Code 1:
import re

# Match Chinese characters: [\u4e00-\u9fa5] is the Unicode range for CJK ideographs,
# used the same way an ASCII class like [a-z] would be
two = '<h2 tid="tid-YkerKe" id="hid-Htc8Nb">Test4(正则表达式 - 纯数字的正则):</h2>'
pattern = re.compile('[\u4e00-\u9fa5]')
result = pattern.findall(two)
print(result)
Returns 1:
['正', '则', '表', '达', '式', '纯', '数', '字', '的', '正', '则']
Code 2:
import re

# Adding + makes consecutive Chinese characters match as whole runs
two = '<h2 tid="tid-YkerKe" id="hid-Htc8Nb">Test4(正则表达式 - 纯数字的正则):</h2>'
pattern = re.compile('[\u4e00-\u9fa5]+')
result = pattern.findall(two)
print(result)
Returns 2:
['正则表达式', '纯数字的正则']
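The same character-class idea extends to other ranges; a minimal sketch of a pure-digit pattern (the sample strings here are made up for illustration):

import re

# [0-9]+ (equivalently \d+) collects consecutive digit runs,
# mirroring [\u4e00-\u9fa5]+ above
digit_pattern = re.compile('[0-9]+')
print(digit_pattern.findall('tid-YkerKe 2021/0303'))  # ['2021', '0303']

# fullmatch checks that an entire string is purely numeric
print(bool(digit_pattern.fullmatch('32040808')))  # True
print(bool(digit_pattern.fullmatch('c1001')))     # False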
Test3 (Regular Expressions - Website Scraping):
Code:
import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
# Fetch the page and save it to a local file
data = requests.get(url, headers=headers).content.decode()
with open('02news.html', 'w', encoding='utf-8') as f:
    f.write(data)
Returns: (nothing is printed; the fetched page is written to 02news.html)
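The .decode() above assumes the page is UTF-8; a hedged sketch of a slightly more defensive fetch that fails loudly on HTTP errors and lets requests guess the charset (same URL and User-Agent as above):

import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}

response = requests.get(url, headers=headers)
response.raise_for_status()                      # raise on 4xx/5xx instead of saving an error page
response.encoding = response.apparent_encoding   # let requests detect the charset
with open('02news.html', 'w', encoding='utf-8') as f:
    f.write(response.text)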
Test4 (Regular Expressions - Simple News Page Scraping):
Code 1:
# coding=gbk
import re
import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()
# Target markup looks like:
# '<a href="http://politics.people.com.cn/n1/2021/0303/c1001-32040808.html" target="_blank" class="a3" mon="ct=1&a=1&c=top&pn=0">人民的信心和支持就是我们国家奋进的力量</a>'
pattern = re.compile('<a href="(.*?)" target="_blank" mon="(.*?)">(.*?)</a>')
result = pattern.findall(data)
print(result)
Returns 1:
Code 2:
# coding=gbk
import re
import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()
# Looser pattern: capture every <a ...>...</a> regardless of attribute order
pattern = re.compile('<a(.*?)</a>')
result = pattern.findall(data)
print(result)
Returns 2:
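A likely reason Code 1 comes back empty: its pattern hard-codes the attribute order and omits class="a3", which the real markup (see the comment in Code 1) carries between target and mon. A minimal offline sketch contrasting the two patterns on that sample line:

import re

sample = ('<a href="http://politics.people.com.cn/n1/2021/0303/c1001-32040808.html" '
          'target="_blank" class="a3" mon="ct=1&a=1&c=top&pn=0">'
          '人民的信心和支持就是我们国家奋进的力量</a>')

strict = re.compile('<a href="(.*?)" target="_blank" mon="(.*?)">(.*?)</a>')
loose = re.compile('<a(.*?)</a>')

print(strict.findall(sample))  # [] -- class="a3" breaks the hard-coded attribute order
print(loose.findall(sample))   # one match: everything between '<a' and the first '</a>'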
·XPath:
Test1 (Basic XPath Usage):
Code:
# coding=gbk
import requests
from lxml import etree

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()
# 1. Convert the raw HTML into a parseable type
xpath_data = etree.HTML(data)
# 2. Call the xpath method
result1 = xpath_data.xpath('/html/head/title/text()')
result2 = xpath_data.xpath('//a/text()')
result3 = xpath_data.xpath('//a[@mon="ct=1&a=1&c=top&pn=0"]/text()')
result4 = xpath_data.xpath('//a[@mon="ct=1&a=1&c=top&pn=0"]/@href')
result5 = xpath_data.xpath('//li/a/text()')
print(result1)
print(result2)
print(result3)
print(result4)
print(result5)
Returns:
Note:
XPath syntax:
1. Node step: /
2. Skip levels: //
3. Exact tag: //a[@attribute="value"]
4. Text wrapped by a tag: /text()
5. Attribute value: @href
6. XPath always returns a List
XPath indices start at 1, and an index only selects among sibling tags at the same level.
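To make these rules concrete without a network request, a minimal sketch over an inline HTML snippet (the markup below is made up for illustration):

from lxml import etree

html = etree.HTML(
    '<html><head><title>demo</title></head><body>'
    '<ul><li><a href="/a" mon="ct=1">first</a></li>'
    '<li><a href="/b">second</a></li></ul></body></html>'
)

print(html.xpath('/html/head/title/text()'))  # ['demo']             rule 1: / walks node by node
print(html.xpath('//a/text()'))               # ['first', 'second']  rule 2: // skips levels
print(html.xpath('//a[@mon="ct=1"]/text()'))  # ['first']            rule 3: exact attribute match
print(html.xpath('//a/@href'))                # ['/a', '/b']         rule 5: @ reads an attribute
print(html.xpath('//li[1]/a/text()'))         # ['first']            indices start at 1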
Test2 (Hands-on Practice):
Using https://www.cnblogs.com/3cH0-Nu1L/default.html?page= as an example.
Code:
# coding=gbk
import requests
from lxml import etree


class BkySpider(object):
    def __init__(self):
        self.base_url = 'https://www.cnblogs.com/3cH0-Nu1L/default.html?page='
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
        }

    # 1. Send the request
    def get_response(self, url):
        response = requests.get(url, headers=self.headers)
        data = response.content.decode()
        return data

    # 2. Parse the data
    def parse_data(self, data):
        # Use XPath to extract every post title on the current page
        # 1. Convert to a parseable type
        x_data = etree.HTML(data)
        # 2. Extract with XPath expressions
        title_list = x_data.xpath('//a[@class="postTitle2 vertical-middle"]/text()')
        url_list = x_data.xpath('//a[@class="postTitle2 vertical-middle"]/@href')
        print(title_list)
        print(url_list)

    # 3. Save the data
    def save_data(self, data):
        with open('05bky.html', 'w', encoding='utf-8') as f:
            f.write(data)

    # 4. Run
    def run(self):
        # 1. Build the full URL
        url = self.base_url + '2'
        # 2. Send the request
        data = self.get_response(url)
        # 3. Parse
        self.parse_data(data)
        # 4. Save
        # self.save_data(data)


BkySpider().run()
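run() above only fetches page 2; a hedged sketch of walking several pages (it reuses the BkySpider class above, and last_page=3 is an assumed value, not taken from the site):

# Hypothetical extension: loop over page numbers instead of hard-coding '2'
class BkyPagedSpider(BkySpider):
    def run_all(self, last_page=3):  # last_page is an assumption for illustration
        for page in range(1, last_page + 1):
            data = self.get_response(self.base_url + str(page))
            self.parse_data(data)

# BkyPagedSpider().run_all()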