使用线程和面向对象与网络爬虫纠缠

问题描述

我是一名 rookie python 开发人员,并尝试构建一个网络爬虫项目。 我从构建所有函数开始,对它们进行排序并一次执行每个函数。 一切正常,输出一个包含所有信息的 .csv 文件,直到我决定将所有函数移动到另一个名为 Function.py 的页面并从 main.py 调用它们并意识到我还需要线程。

所以主要对象是获取我可以从网站/子文件夹中获取的所有详细信息。 这是 main.py 代码

import csv
from Functions import *
from threading import *

csv_file = open('scrap.csv','w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['ID','Name','Faculty','Class','Picture','Rank','Link'])
threads = []
lock = Lock()
result_info = []


def main(link):
    list_of_departments = Thread(target=fetch_departments,args=(link,result_info,))
    list_of_departments.start()
    threads.append(list_of_departments)
    peoples_links = Thread(target=create_people_links,args=(list_of_departments,))
    peoples_links.start()
    threads.append(peoples_links)
    peoples_info = Thread(target=fetch_first_info,args=(peoples_links,))
    peoples_info.start()
    threads.append(peoples_info)
    details = Thread(target=fetch_people_details,args=(peoples_info,))
    details.start()
    threads.append(details)


yale_link = "https://www.yale.edu/academics/departments-programs"
inst_id = 0
results = main(yale_link)
t = Thread(target=main,args=(yale_link,))
t.start()

for thread in threads:
    t.join()

print(results)
"""for inst in results:
    if inst_id > len(results):
        break
    else:
        print([inst_id,inst[0]['name'],inst[0]['faculty'],inst[0]['class'],inst[0]['img'],inst[0]['title'],inst[0]['link']])
        inst_id += 1
csv_file.close()"""

和 Functions.py 代码是:

from bs4 import BeautifulSoup
import requests


def fetch_departments(link,result_info):
    extracted_links = []
    url = requests.get(link).text
    soup = BeautifulSoup(url,'lxml')
    departments = soup.find_all('div',class_="department_item_cell_60")

    for department in departments:
        department_name = department.h4.a.text
        department_link = department.h4.a['href']
        extracted_links.append([department_name,department_link])
        result_info.append([{'faculty': department_name}])

    return extracted_links


def create_people_links(list_of_links):
    subfolder_links = []
    for link in list_of_links:
        subfolder_links.append(f'{str(link[1])[:str(link).index(".edu")] + "/people"}')

    return subfolder_links


def fetch_first_info(subdirectory_links,result_info):
    index = 0
    entities_links = []
    for link in subdirectory_links:
        url = requests.get(link).text
        soup = BeautifulSoup(url,'lxml')
        people_list = soup.find_all('td',class_="views-field-name")
        people_class = soup.title.text
        try:
            people_class = str(people_class).splitlines().pop(1)
            people_class = people_class[people_class.index(" | ") + 3:]

        except Exception as e:
            print(e)
            people_class = soup.title.text

        finally:
            for people in people_list:
                people_name = people.a.text
                people_subfolder = people.a['href']
                http_sub_folder = str(people_subfolder)[str(people_subfolder).index("/people") + 7:]
                new_folder_link = f'{link + http_sub_folder}'
                result_info[index][0].update({'id': index,'name': people_name,'class': people_class,'link': new_folder_link})
                index += 1
                entities_links.append(new_folder_link)

    return entities_links


def fetch_people_details(list_of_links,result_info):
    index = 0
    for link in list_of_links:
        if index > len(list_of_links):
            break
        else:
            url = requests.get(link).text
            soup = BeautifulSoup(url,'lxml')
            instance_title = soup.find('div',class_="field-item even")
            instance_pic = soup.find('a',title="View user profile.")
            if instance_title:
                result_info[index][0]['title'] = instance_title.text
            else:
                result_info[index][0]['title'] = "non available"

            if instance_pic:
                does_instance_pic = "yes"
            else:
                does_instance_pic = "no"

            result_info[index][0]['img'] = does_instance_pic
        index += 1

从那以后,我收到了各种各样的错误(例如 TypeError: 'Thread' object is not iterable )和 IndexErrors。 我也想面向对象,不知道从哪里开始.. 有人可以告诉我我哪里错了,以及我如何启动面向对象的线索吗?

对不起,如果重复,我是新手,我会提供任何帮助 谢谢!!

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)