问题描述
我是一名 roockie python 开发人员,并尝试构建一个网络爬虫项目。 我从构建所有函数开始,对它们进行排序并一次执行每个函数。 一切正常,输出是一个包含所有信息的 .csv 文件,直到我决定将所有函数移动到另一个名为 Function.py 的页面并从 main.py 调用它们并意识到我还需要线程。>
所以主要对象是获取我可以从网站/子文件夹中获取的所有详细信息。 这是 main.py 代码:
import csv
from Functions import *
from threading import *
csv_file = open('scrap.csv','w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['ID','Name','Faculty','Class','Picture','Rank','Link'])
threads = []
lock = Lock()
result_info = []
def main(link):
list_of_departments = Thread(target=fetch_departments,args=(link,result_info,))
list_of_departments.start()
threads.append(list_of_departments)
peoples_links = Thread(target=create_people_links,args=(list_of_departments,))
peoples_links.start()
threads.append(peoples_links)
peoples_info = Thread(target=fetch_first_info,args=(peoples_links,))
peoples_info.start()
threads.append(peoples_info)
details = Thread(target=fetch_people_details,args=(peoples_info,))
details.start()
threads.append(details)
yale_link = "https://www.yale.edu/academics/departments-programs"
inst_id = 0
results = main(yale_link)
t = Thread(target=main,args=(yale_link,))
t.start()
for thread in threads:
t.join()
print(results)
"""for inst in results:
if inst_id > len(results):
break
else:
print([inst_id,inst[0]['name'],inst[0]['faculty'],inst[0]['class'],inst[0]['img'],inst[0]['title'],inst[0]['link']])
inst_id += 1
csv_file.close()"""
和 Functions.py 代码是:
from bs4 import BeautifulSoup
import requests
def fetch_departments(link,result_info):
extracted_links = []
url = requests.get(link).text
soup = BeautifulSoup(url,'lxml')
departments = soup.find_all('div',class_="department_item_cell_60")
for department in departments:
department_name = department.h4.a.text
department_link = department.h4.a['href']
extracted_links.append([department_name,department_link])
result_info.append([{'faculty': department_name}])
return extracted_links
def create_people_links(list_of_links):
subfolder_links = []
for link in list_of_links:
subfolder_links.append(f'{str(link[1])[:str(link).index(".edu")] + "/people"}')
return subfolder_links
def fetch_first_info(subdirectory_links,result_info):
index = 0
entities_links = []
for link in subdirectory_links:
url = requests.get(link).text
soup = BeautifulSoup(url,'lxml')
people_list = soup.find_all('td',class_="views-field-name")
people_class = soup.title.text
try:
people_class = str(people_class).splitlines().pop(1)
people_class = people_class[people_class.index(" | ") + 3:]
except Exception as e:
print(e)
people_class = soup.title.text
finally:
for people in people_list:
people_name = people.a.text
people_subfolder = people.a['href']
http_sub_folder = str(people_subfolder)[str(people_subfolder).index("/people") + 7:]
new_folder_link = f'{link + http_sub_folder}'
result_info[index][0].update({'id': index,'name': people_name,'class': people_class,'link': new_folder_link})
index += 1
entities_links.append(new_folder_link)
return entities_links
def fetch_people_details(list_of_links,result_info):
index = 0
for link in list_of_links:
if index > len(list_of_links):
break
else:
url = requests.get(link).text
soup = BeautifulSoup(url,'lxml')
instance_title = soup.find('div',class_="field-item even")
instance_pic = soup.find('a',title="View user profile.")
if instance_title:
result_info[index][0]['title'] = instance_title.text
else:
result_info[index][0]['title'] = "non available"
if instance_pic:
does_instance_pic = "yes"
else:
does_instance_pic = "no"
result_info[index][0]['img'] = does_instance_pic
index += 1
从那以后,我收到了各种各样的错误(例如 TypeError: 'Thread' object is not iterable )和 IndexErrors。 我也想面向对象,不知道从哪里开始.. 有人可以告诉我我哪里错了,以及我如何启动面向对象的线索吗?
对不起,如果重复,我是新手,我会提供任何帮助 谢谢!!
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)