Why does this code scrape correctly but not save any CSV or Excel files?

Problem Description

I got this code from the book Machine Learning for Algorithmic Trading by Stefan Jansen. It scrapes earnings call transcripts for quoted companies from seekingalpha.com. I can see that the scraping itself runs correctly, but I then can't find any saved files with the scraped results. I suspect the path may be wrong, but I'm not sure. I'm using a MacBook Pro and Spyder. Thanks, Alessio


__author__ = 'Stefan Jansen'

import re
from pathlib import Path
from random import random
from time import sleep
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from furl import furl
from selenium import webdriver

transcript_path = Path('transcripts')

def store_result(Meta,participants,content):
    """Save parse content to csv"""
    path = transcript_path / 'parsed' / Meta['symbol']
    if not path.exists():
        path.mkdir(parents=True,exist_ok=True)
    pd.DataFrame(content,columns=['speaker','q&a','content']).to_csv('content.csv',index=False)
    pd.DataFrame(participants,columns=['type','name']).to_csv('participants.csv',index=False)
    pd.Series(Meta).to_csv('earnings.csv')
    

def parse_html(html):
    """Main html parser function"""
    date_pattern = re.compile(r'(\d{2})-(\d{2})-(\d{2})')
    quarter_pattern = re.compile(r'(\bQ\d\b)')
    soup = BeautifulSoup(html,'html.parser')

    Meta,participants,content = {},[],[]
    h1 = soup.find('h1',itemprop='headline')
    if h1 is None:
        return
    h1 = h1.text
    Meta['company'] = h1[:h1.find('(')].strip()
    Meta['symbol'] = h1[h1.find('(') + 1:h1.find(')')]

    title = soup.find('div',class_='title')
    if title is None:
        return
    title = title.text
    print(title)
    match = date_pattern.search(title)
    if match:
        m,d,y = match.groups()
        Meta['month'] = int(m)
        Meta['day'] = int(d)
        Meta['year'] = int(y)

    match = quarter_pattern.search(title)
    if match:
        Meta['quarter'] = match.group(0)

    qa = 0
    speaker_types = ['Executives','Analysts']
    for header in [p.parent for p in soup.find_all('strong')]:
        text = header.text.strip()
        if text.lower().startswith('copyright'):
            continue
        elif text.lower().startswith('question-and'):
            qa = 1
            continue
        elif any(t in text for t in speaker_types):
            for participant in header.find_next_siblings('p'):
                if participant.find('strong'):
                    break
                else:
                    participants.append([text,participant.text])
        else:
            p = []
            for participant in header.find_next_siblings('p'):
                if participant.find('strong'):
                    break
                else:
                    p.append(participant.text)
            content.append([header.text,qa,'\n'.join(p)])
    return Meta,participants,content


SA_URL = 'https://seekingalpha.com/'
TRANSCRIPT = re.compile('Earnings Call Transcript')

next_page = True
page = 1
driver = webdriver.Firefox(executable_path='/Users/alessiomontani/Documents/0_Python/CODE/Machine-Learning-for-Algorithmic-Trading-Second-Edition-master/03_alternative_data/02_earnings_calls copy/geckodriver')
while next_page:
    print(f'Page: {page}')
    url = f'{SA_URL}earnings/earnings-call-transcripts/{page}'
    driver.get(url)
    sleep(8 + (random() - .5) * 2)
    response = driver.page_source
    page += 1
    soup = BeautifulSoup(response,'html.parser')
    links = soup.find_all(name='a',string=TRANSCRIPT)
    if len(links) == 0:
        next_page = False
    else:
        for link in links:
            transcript_url = link.attrs.get('href')
            article_url = furl(urljoin(SA_URL,transcript_url)).add({'part': 'single'})
            driver.get(article_url.url)
            html = driver.page_source
            result = parse_html(html)
            if result is not None:
                Meta,participants,content = result
                Meta['link'] = transcript_url  # store the href string, not the bs4 tag
                store_result(Meta,participants,content)
                sleep(8 + (random() - .5) * 2)

driver.close()
#pd.Series(articles).to_csv('articles.csv')

Solution

No effective solution to this problem has been found yet; the editor is still working on tracking one down!
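
That said, the posted code itself points to a plausible cause: store_result builds a per-symbol directory in path, but the three to_csv calls receive bare filenames ('content.csv', 'participants.csv', 'earnings.csv'), so pandas writes them into the current working directory instead of transcripts/parsed/<symbol>/. In Spyder the working directory is often not the script's folder, which would make the output appear to be missing. A minimal sketch of the probable fix, keeping the original function shape (unverified):

def store_result(Meta, participants, content):
    """Save parsed content to csv files under transcripts/parsed/<symbol>."""
    path = transcript_path / 'parsed' / Meta['symbol']
    path.mkdir(parents=True, exist_ok=True)
    # Prefix each filename with the per-symbol path so the files land
    # there rather than in the current working directory.
    pd.DataFrame(content, columns=['speaker', 'q&a', 'content']).to_csv(path / 'content.csv', index=False)
    pd.DataFrame(participants, columns=['type', 'name']).to_csv(path / 'participants.csv', index=False)
    pd.Series(Meta).to_csv(path / 'earnings.csv')

As a quick check, adding print(Path.cwd()) at the top of the script shows the directory that the bare-filename to_csv calls have been writing into.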

If you have already found a good solution, you are welcome to send it to the editor together with a link to this post.

Editor's email: dio#foxmail.com (replace # with @)