问题描述
# Build the list of paginated in-stock search URLs.
urls = []
for page in pages:
    url = "https://www.golddist.com/index.PHP?skip=" + str(page) + "&m=search&instock=1"
    urls.append(url)
#print(urls)

# Accumulators declared ONCE, before the page loop.  The original script
# re-created these lists and rewrote the CSV inside the loop, so every new
# page overwrote the rows scraped from the previous one.
Identifier = []
Title = []
Note = []
Price = []

for pgur in urls:
    browser.get(pgur)
    time.sleep(1)  # crude fixed wait; a WebDriverWait would be more reliable

    # NOTE(review): these absolute XPaths are brittle — any layout change on
    # the site breaks them; relative selectors would be safer if possible.
    manufs = browser.find_elements_by_xpath('/html/body/div/div[2]/div[2]/div/form/div[1]/table/tbody/tr[position()>1]/td[1]/a/b/font/span')
    Identifier.extend(el.text for el in manufs)

    heads = browser.find_elements_by_xpath('/html/body/div/div[2]/div[2]/div/form/div[1]/table/tbody/tr[position()>1]/td[2]/span/a/b')
    Title.extend(el.text for el in heads)

    stocks = browser.find_elements_by_xpath('/html/body/div/div[2]/div[2]/div/form/div[1]/table/tbody/tr[position()>1]/td[3]/center/b/font/font')
    Note.extend(el.text for el in stocks)

    pri = browser.find_elements_by_xpath('/html/body/div/div[2]/div[2]/div/form/div[1]/table/tbody/tr[position()>1]/td[5]/center/b/font')
    Price.extend(el.text for el in pri)

# Single write, AFTER the loop: the CSV now holds rows from every page.
data = {'Identifier': Identifier, 'Title': Title, 'Price': Price, 'Note': Note}
df = pd.DataFrame(data)
df.to_csv('GD1.csv', index=False)
如何从所有页面追加数据而不是不断替换新页面中的行? 我想从所有页面中抓取产品数据。但我的脚本只是用从新页面抓取的数据刷新 CSV 文件。
解决方法
使用列表在每次迭代时存储数据,最后将所有结果连接到单个数据框中:
ldf = []  # <- one DataFrame per URL; concatenated ONCE after the loop
for pgur in urls:
    browser.get(pgur)
    time.sleep(1)
    ...  # scrape Identifier / Title / Price / Note for this page as before
    data = {'Identifier': Identifier, 'Title': Title, 'Price': Price, 'Note': Note}
    df = pd.DataFrame(data)
    ldf.append(df)  # <- keep this page's rows instead of overwriting the CSV

# Concatenate a single time (the original called pd.concat twice);
# ignore_index=True yields a clean 0..n-1 index instead of repeating each
# page's own 0-based index in the combined frame.
df = pd.concat(ldf, ignore_index=True)
df.to_csv('GD1.csv', index=False)  # <- one write containing ALL pages