Problem Description
I'm scraping data from Google Scholar and want to store it in a database. The first two tables work, but I can't insert the foreign keys into the third table. I think the tables themselves are fine, since creating them raises no errors; the problem only appears when I try to insert data into them. I'd also like to know how to insert a paper into the "Publicacion" table just once: the same paper can appear in several authors' profiles, but I need it only once in that table.
import sys
import time
import sqlite3
import urllib.parse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait as W

chrome_path = r"chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
wait = W(driver, 1)
papers = []

try:
    conn = sqlite3.connect('scholar.db')
except sqlite3.Error:
    print("No connection")
    sys.exit(0)

conn.execute("PRAGMA foreign_keys = 1")
cur = conn.cursor()
cur.execute("""CREATE TABLE IF NOT EXISTS Autor (
id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,Nombre_Autor TEXT NOT NULL,Titulo_Paper TEXT NOT NULL,Citado_Por TEXT NOT NULL,Año TEXT NOT NULL,Id_Paper TEXT NOT NULL,Link_Paper TEXT NOT NULL
)""")
cur.execute("""CREATE TABLE IF NOT EXISTS Publicacion (
id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,Id_Paper TEXT NOT NULL
)""")
cur.execute("""CREATE TABLE IF NOT EXISTS Autor_Publicacion(
A_id INTEGER,P_id INTEGER,FOREIGN KEY(A_id) REFERENCES Autor(id),FOREIGN KEY(P_id) REFERENCES Publicacion(id)
)""")
conn.commit()
# https://scholar.google.com/citations?hl=en&user=2BTxUj4AAAAJ
urls = []
with open(r'perfil.csv', 'r') as f:
    for line in f:
        urls.append(line.strip())  # drop the trailing newline
for url in urls:
    driver.get(url)
    # click "Show more" a few times so the profile lists more papers
    more = driver.find_element_by_class_name('gs_btnPD')
    for _ in range(0, 5):
        ActionChains(driver).click(more).perform()
        time.sleep(1)
    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        citation_indices = soup.find('table', attrs={'id': 'gsc_rsb_st'})
        research_article = soup.find_all('tr', {'class': 'gsc_a_tr'})
        author_details = soup.find('div', {'id': 'gsc_prf_i'})
        time.sleep(3)
        try:
            for i, research in enumerate(research_article, 1):
                name = author_details.find('div', attrs={'id': 'gsc_prf_in'}).text
                pub_details = research.find('td', attrs={'class': 'gsc_a_t'})
                pub_ref = pub_details.a['href']
                pub_Meta = pub_details.find_all('div')
                title = pub_details.a.text
                cited_by = research.find('a', attrs={'class': 'gsc_a_ac'}).text or ''
                year = research.find('span', attrs={'class': 'gsc_a_h'}).text or ''
                idpaper = research.find('a', attrs={'class': 'gsc_a_at'})
                d = idpaper.get('data-href')
                linkpaper = urllib.parse.urljoin("https://scholar.google.com", d)
                parsed = urllib.parse.urlparse(d)
                mydata = [name, title, cited_by, year, parsed.query, linkpaper]
                mydata2 = [name, parsed.query]
                papers.append(mydata)
                papers.append(mydata2)
cur.executemany("""INSERT INTO Autor (Nombre_Autor,Titulo_Paper,Citado_Por,Año,Id_Paper,Link_Paper) VALUES (?,?,?)""",[mydata])
A_id = cur.lastrowid
cur.executemany("""INSERT INTO Publicacion (Nombre_Autor,Id_Paper) VALUES (?,[mydata2])
P_id = cur.lastrowid
cur.executemany("""INSERT INTO Autor_Publicacion (A_id,P_id) VALUES(?,[A_id,P_id])
conn.commit()
except:
pass
        if len(research_article) != 100:
            print(f'Page {url} scraped')
            break
Solution
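The foreign keys never reach Autor_Publicacion because the two inserts before it already fail, and the bare except: pass hides the errors. As written, the Autor INSERT lists six columns but only three ? placeholders; the Publicacion INSERT names a Nombre_Autor column that table does not have; and executemany expects a sequence of parameter rows, so passing the flat list [A_id, P_id] feeds it two bare integers instead of one row. Below is a minimal, self-contained sketch of the corrected insert logic against the same schema, using an in-memory database and a made-up sample row:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("PRAGMA foreign_keys = 1")
cur = conn.cursor()
cur.execute("""CREATE TABLE Autor (
    id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
    Nombre_Autor TEXT NOT NULL, Titulo_Paper TEXT NOT NULL,
    Citado_Por TEXT NOT NULL, Año TEXT NOT NULL,
    Id_Paper TEXT NOT NULL, Link_Paper TEXT NOT NULL)""")
cur.execute("""CREATE TABLE Publicacion (
    id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
    Id_Paper TEXT NOT NULL)""")
cur.execute("""CREATE TABLE Autor_Publicacion (
    A_id INTEGER, P_id INTEGER,
    FOREIGN KEY(A_id) REFERENCES Autor(id),
    FOREIGN KEY(P_id) REFERENCES Publicacion(id))""")

# One made-up scraped row: [name, title, cited_by, year, paper id, link].
mydata = ["J. Doe", "A sample paper", "12", "2019",
          "user=X&citation_for_view=Y", "https://scholar.google.com/citations"]

# Six columns need six placeholders, and a single row goes through
# execute(), not executemany().
cur.execute("""INSERT INTO Autor
    (Nombre_Autor, Titulo_Paper, Citado_Por, Año, Id_Paper, Link_Paper)
    VALUES (?, ?, ?, ?, ?, ?)""", mydata)
A_id = cur.lastrowid                # primary key of the Autor row

# Publicacion only has Id_Paper, so insert only Id_Paper.
cur.execute("INSERT INTO Publicacion (Id_Paper) VALUES (?)", (mydata[4],))
P_id = cur.lastrowid                # primary key of the Publicacion row

# Both parent rows exist now, so the foreign keys are accepted.
cur.execute("INSERT INTO Autor_Publicacion (A_id, P_id) VALUES (?, ?)",
            (A_id, P_id))
conn.commit()

With PRAGMA foreign_keys = 1 the order matters: inserting the Autor_Publicacion row before its parent rows would raise a FOREIGN KEY constraint error, which is also worth seeing, so it helps to log the exception in the loop instead of passing silently.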
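For the second question (storing each paper only once even though it appears in several authors' profiles), one common approach, an assumption here rather than something in the original schema, is to declare Id_Paper UNIQUE and insert with INSERT OR IGNORE, then read the id back. A standalone sketch, with publicacion_id as a hypothetical helper name:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
# UNIQUE on Id_Paper (an addition to the original schema) makes a
# repeated insert a no-op instead of a duplicate row.
cur.execute("""CREATE TABLE Publicacion (
    id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
    Id_Paper TEXT NOT NULL UNIQUE)""")

def publicacion_id(cur, id_paper):
    # Hypothetical helper: insert the paper if unseen, return its id either way.
    cur.execute("INSERT OR IGNORE INTO Publicacion (Id_Paper) VALUES (?)",
                (id_paper,))
    cur.execute("SELECT id FROM Publicacion WHERE Id_Paper = ?", (id_paper,))
    return cur.fetchone()[0]

first = publicacion_id(cur, "user=X&citation_for_view=Y")
again = publicacion_id(cur, "user=X&citation_for_view=Y")
print(first == again)   # True: the paper was stored only once

In the scraper loop that would become P_id = publicacion_id(cur, parsed.query), with the (A_id, P_id) pair still inserted into Autor_Publicacion once per author-paper combination.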