爬虫数据处理:根据关键词寻找合适的职位
背景
最近又到了招聘季,招聘网站那么多,我们该如何根据关键词找出合适的招聘信息呢?本文基于之前用Scrapy爬取的结果,对数据进行过滤,从中筛选出合适的职位信息。
数据:
使用Scrapy爬取的职位名称以及对应的业务详情,分别保存在abstract.json和detail.json中(每行一个JSON对象)。其中:
- abstract.json:包括subject、title、link、author、date等,表示职位类型、职位名称、链接、发布者、发布时间等
- detail.json:包括title、detail,表示职位名称和业务详情。
要求:根据指定的关键词列表,从数据中提取最近一周内发布的职位信息。
代码如下:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName :wordfilter.py
# @Time :2022/1/19 17:25
# @Author :PangXZ
import json
import re
import datetime
import numpy as np
# Keywords searched for in each job's detail text by load_detail().
# NOTE(review): the values below appear to be placeholders -- replace them
# with the real search terms before running.
KEYWORDS = ['xxxx', 'xx', 'xx', 'xxx', 'xxxx']
def load_detail(document, keywords=None):
    """Collect the titles of jobs whose detail text mentions a keyword.

    The file is read line by line; every non-empty line is expected to hold
    one JSON object with ``title`` and ``detail`` keys. A trailing comma on a
    line is ignored, and lines consisting only of ``[`` or ``]`` are skipped,
    so a JSON array written with one object per line is accepted as well.

    Args:
        document: Path of the detail file (UTF-8 encoded).
        keywords: Iterable of keywords to look for. Defaults to the
            module-level ``KEYWORDS`` list.

    Returns:
        A list of unique titles, in order of first appearance, with any
        ``[...]`` tags removed and surrounding whitespace stripped.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
        KeyError: If a record lacks the ``title`` or ``detail`` key.
    """
    if keywords is None:
        keywords = KEYWORDS
    keywords = list(keywords)
    tag_pattern = re.compile(r"\[.*?]")

    # A dict keeps insertion order, giving de-duplication with a stable
    # (first-seen) ordering instead of the arbitrary order of a set.
    titles = {}
    with open(document, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip().rstrip(',')
            if not line or line in ('[', ']'):
                continue
            content = json.loads(line)
            # ``detail`` may not be a plain string, so compare against its
            # string form.
            detail = str(content['detail'])
            if any(keyword in detail for keyword in keywords):
                titles[tag_pattern.sub("", content['title']).strip()] = None
    return list(titles)
def load_abstract(document, today=None, days=7):
    """Load the job abstracts published within the last ``days`` days.

    The file is read line by line; every non-empty line is expected to hold
    one JSON object with ``title``, ``link`` and ``date`` keys, where ``date``
    is a ``YYYY-MM-DD`` string. A trailing comma on a line is ignored, and
    lines consisting only of ``[`` or ``]`` are skipped.

    Args:
        document: Path of the abstract file (UTF-8 encoded).
        today: Reference ``datetime.date``; defaults to the current date.
        days: Size of the look-back window in days (default 7). A record is
            kept when ``(today - date).days < days``.

    Returns:
        A list of ``[title, link, date]`` lists in file order, where ``date``
        is the original date string. Records with a missing, null or empty
        date are skipped.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
        KeyError: If a record lacks the ``title`` or ``link`` key.
        ValueError: If a non-empty date is not of the form ``Y-M-D``.
    """
    if today is None:
        today = datetime.date.today()

    abstract = []
    with open(document, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip().rstrip(',')
            if not line or line in ('[', ']'):
                continue
            content = json.loads(line)
            title = content['title']
            link = content['link']
            date = content.get('date')
            # Check for a missing date *before* parsing it; splitting None
            # would raise instead of skipping the record.
            if not date:
                continue
            year, month, day = map(int, date.split('-'))
            cursor = datetime.date(year, month, day)
            if (today - cursor).days < days:
                abstract.append([title, link, date])
    return abstract
def get_right_job(position, abstract):
    """Select the abstract rows whose title is one of the wanted positions.

    Args:
        position: Iterable of job titles to keep.
        abstract: Sequence of rows whose first three items are
            ``title``, ``link`` and ``date``.

    Returns:
        A list of ``[title, link, date]`` lists, one per matching title, in
        the order the titles first appear in ``abstract``. When a title
        occurs several times in ``abstract`` only its first row is used.
        An empty ``position`` or ``abstract`` yields an empty list.
    """
    wanted = set(position)
    seen = set()
    results = []
    for row in abstract:
        title = row[0]
        # Keep only the first row for each wanted title.
        if title in wanted and title not in seen:
            seen.add(title)
            results.append([title, row[1], row[2]])
    return results
if __name__ == "__main__":
    # Titles of jobs whose detail text mentions one of the keywords.
    matched_titles = load_detail(document='detail.json')
    # Abstract rows that pass the date filter in load_abstract().
    recent_rows = load_abstract(document='abstract.json')
    # Print each selected [title, link, date] row on its own line.
    for row in get_right_job(position=matched_titles, abstract=recent_rows):
        print(row)
执行结果如下: