问题描述
我从《Machine Learning for Algorithmic Trading》(机器学习算法交易)一书中获得了这段代码。这段代码用于从 seekingalpha.com 抓取上市公司的财报电话会议记录(earnings call transcripts)。我看到代码能够正确地进行抓取,但之后找不到保存抓取结果的文件。我认为可能是保存路径有误,但我不确定。我使用的是 MacBook Pro 和 Spyder。谢谢!阿莱西奥
__author__ = 'Stefan Jansen'
import re
from pathlib import Path
from random import random
from time import sleep
from urllib.parse import urljoin
import pandas as pd
from bs4 import BeautifulSoup
from furl import furl
from selenium import webdriver
transcript_path = Path('transcripts')


def store_result(Meta, participants, content):
    """Save one parsed transcript as CSV files in its ticker directory.

    Writes ``content.csv``, ``participants.csv`` and ``earnings.csv`` into
    ``transcript_path / 'parsed' / Meta['symbol']``, creating that directory
    (and any missing parents) first.

    Args:
        Meta: Mapping of transcript metadata; must contain the key
            ``'symbol'``, which names the output directory.
        participants: Rows of ``[type, name]``.
        content: Rows of ``[speaker, q&a, content]``.

    Raises:
        KeyError: If ``Meta`` has no ``'symbol'`` entry.
    """
    path = transcript_path / 'parsed' / Meta['symbol']
    path.mkdir(parents=True, exist_ok=True)

    # Write *inside* `path`. Bare file names would be resolved against the
    # current working directory, so every transcript would overwrite the
    # previous one and nothing would ever appear under `transcripts/`.
    content_frame = pd.DataFrame(content, columns=['speaker', 'q&a', 'content'])
    content_frame.to_csv(path / 'content.csv', index=False)

    participants_frame = pd.DataFrame(participants, columns=['type', 'name'])
    participants_frame.to_csv(path / 'participants.csv', index=False)

    pd.Series(Meta).to_csv(path / 'earnings.csv')
def parse_html(html):
    """Parse the page source of one earnings-call transcript article.

    Args:
        html: HTML source of the article page.

    Returns:
        A ``(meta, participants, content)`` tuple, or ``None`` when the page
        has no ``<h1 itemprop="headline">``, the headline carries no
        ``(SYMBOL)`` marker, or there is no ``<div class="title">``.

        * ``meta``: dict with ``'company'`` and ``'symbol'`` taken from the
          headline, plus ``'month'``/``'day'``/``'year'`` and ``'quarter'``
          when the title matches the corresponding pattern.
        * ``participants``: list of ``[section_header_text, name]`` rows for
          sections whose header mentions ``Executives`` or ``Analysts``.
        * ``content``: list of ``[speaker, qa_flag, text]`` rows, where
          ``qa_flag`` becomes 1 after a header starting with
          ``question-and`` has been seen.
    """
    date_pattern = re.compile(r'(\d{2})-(\d{2})-(\d{2})')
    quarter_pattern = re.compile(r'(\bQ\d\b)')
    soup = BeautifulSoup(html, 'html.parser')

    meta, participants, content = {}, [], []

    h1 = soup.find('h1', itemprop='headline')
    if h1 is None:
        return None
    headline = h1.text
    # The symbol is taken from the first "(...)" in the headline. Without both
    # parentheses str.find() returns -1 and the slices would yield garbage
    # that is later used as a directory name, so skip such pages.
    open_idx = headline.find('(')
    close_idx = headline.find(')', open_idx + 1)
    if open_idx == -1 or close_idx == -1:
        return None
    symbol = headline[open_idx + 1:close_idx]
    if not symbol:
        return None
    meta['company'] = headline[:open_idx].strip()
    meta['symbol'] = symbol

    title = soup.find('div', class_='title')
    if title is None:
        return None
    title = title.text
    print(title)

    match = date_pattern.search(title)
    if match:
        m, d, y = match.groups()
        meta['month'] = int(m)
        meta['day'] = int(d)
        meta['year'] = int(y)

    match = quarter_pattern.search(title)
    if match:
        meta['quarter'] = match.group(0)

    qa = 0
    speaker_types = ['Executives', 'Analysts']
    # Every <strong> marks a section header or a speaker; its parent element
    # is the anchor from which the following <p> siblings are collected.
    for header in [strong.parent for strong in soup.find_all('strong')]:
        text = header.text.strip()
        lowered = text.lower()
        if lowered.startswith('copyright'):
            continue
        if lowered.startswith('question-and'):
            qa = 1
            continue

        # Collect the <p> siblings up to (not including) the next one that
        # contains a <strong>, i.e. the next header/speaker.
        paragraphs = []
        for sibling in header.find_next_siblings('p'):
            if sibling.find('strong'):
                break
            paragraphs.append(sibling.text)

        if any(kind in text for kind in speaker_types):
            participants.extend([text, name] for name in paragraphs)
        else:
            content.append([header.text, qa, '\n'.join(paragraphs)])
    return meta, participants, content


SA_URL = 'https://seekingalpha.com/'
TRANSCRIPT = re.compile('Earnings Call Transcript')

next_page = True
page = 1
# NOTE(review): machine-specific geckodriver path. Newer Selenium releases
# replace the `executable_path` argument with a `Service` object -- confirm
# against the installed Selenium version.
driver = webdriver.Firefox(executable_path='/Users/alessiomontani/Documents/0_Python/CODE/Machine-Learning-for-Algorithmic-Trading-Second-Edition-master/03_alternative_data/02_earnings_calls copy/geckodriver')
try:
    while next_page:
        print(f'Page: {page}')
        # SA_URL already ends with '/', so join a relative path instead of
        # formatting a second slash into the URL.
        url = urljoin(SA_URL, f'earnings/earnings-call-transcripts/{page}')
        driver.get(url)
        sleep(8 + (random() - .5) * 2)
        response = driver.page_source
        page += 1

        soup = BeautifulSoup(response, 'html.parser')
        links = soup.find_all(name='a', string=TRANSCRIPT)
        if not links:
            # A listing page without transcript links ends the crawl.
            next_page = False
        else:
            for link in links:
                transcript_url = link.attrs.get('href')
                article_url = furl(urljoin(SA_URL, transcript_url)).add({'part': 'single'})
                driver.get(article_url.url)
                html = driver.page_source
                result = parse_html(html)
                if result is not None:
                    Meta, participants, content = result
                    # NOTE(review): `link` is the anchor Tag, so its HTML
                    # string is what ends up in earnings.csv -- confirm
                    # whether the URL was intended instead.
                    Meta['link'] = link
                    store_result(Meta, participants, content)
                # Throttle after every article request, parsed or not.
                sleep(8 + (random() - .5) * 2)
finally:
    # Always end the browser session, even if scraping raised.
    driver.quit()
#pd.Series(articles).to_csv('articles.csv')
解决方法
暂未找到可以解决该程序问题的有效方法,小编正在努力寻找和整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)