问题描述
目前我只是把抓取到的数据打印出来。现在我不想打印,而是想把数据导出到
Excel/CSV。我是 Python 新手,请帮帮忙。
**数据大约有 9000 行、6 列,这算非常庞大吗?**
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
def scrap_bid_data():
    """Print the bids listed on bidplus.gem.gov.in, one results page at a time.

    Pages are requested in order starting at page 1. Scraping stops at the
    first page whose ``pagi_content`` div is missing or has no children.
    For every bid the number, items, quantity, department, start date and
    end date are printed between two dashed separator lines.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (including when the request times out).
        IndexError: if a bid entry has fewer text lines than expected.
    """
    page_no = 1  # initial page number
    while True:
        print('Hold on creating URL to fetch data...')
        url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)
        print('URL cerated: ' + url)
        # NOTE(review): verify=False disables TLS certificate validation; it is
        # kept because the file deliberately silences InsecureRequestWarning.
        # The timeout keeps a stalled connection from hanging the script.
        scraped_data = requests.get(url, verify=False, timeout=30)
        soup_data = bs(scraped_data.text, 'lxml')
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})
        # find() returns None when the div is absent, so check for that
        # explicitly before asking for the number of children.
        if extracted_data is None or len(extracted_data) == 0:
            break
        # Per the original author's note, the required data sits on the
        # odd-indexed children only.
        for bid_card in extracted_data.contents[1::2]:
            bid_data = bid_card.text.strip().split('\n')
            print('-' * 100)
            print(bid_data[0])  # BID number
            print(bid_data[5])  # Items
            print(bid_data[6])  # Quantity required
            print(bid_data[10] + bid_data[12].strip())  # Department name and address
            print(bid_data[16])  # Start date
            print(bid_data[17])  # End date
            print('-' * 100)
        page_no += 1  # move on to the next page


if __name__ == '__main__':
    scrap_bid_data()
解决方法
给你,代码如下:
import requests
from urllib3.exceptions import InsecureRequestWarning
import csv
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
def _parse_bid(bid_data):
    """Turn the text lines of one bid entry into a CSV row.

    Returns ``[bid number, items, quantity (int), department, end date]``.
    Raises IndexError when lines are missing and ValueError when the
    quantity is not an integer.
    """
    bidno = bid_data[0].split(":")[-1]
    items = bid_data[5].split(":")[-1]
    qnty = int(bid_data[6].split(':')[1].strip())
    dept = (bid_data[10] + bid_data[12].strip()).split(":")[-1]
    # Split on the full label rather than ":" -- presumably because the
    # date/time value itself contains colons (verify against the page).
    edate = bid_data[17].split("End Date:")[-1]
    return [bidno, items, qnty, dept, edate]


def scrap_bid_data(csv_path='gembid.csv', stop_page=911):
    """Scrape the bid list pages and write one CSV row per bid.

    Args:
        csv_path: Output file; it is created or overwritten.
        stop_page: Exclusive upper bound on the page number. Pages
            ``1 .. stop_page - 1`` are requested unless an empty page is
            reached first.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (including when the request times out).
        IndexError, ValueError: if a bid entry does not have the expected
            layout (see ``_parse_bid``).
    """
    # newline='' is what the csv module requires of its file object; without
    # it extra blank rows appear on platforms that translate line endings.
    # The with-block guarantees the rows are flushed and the file is closed.
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Bidnumber', 'Items', 'Quantitiy', 'Department', 'Enddate'])
        for page_no in range(1, stop_page):
            print('Hold on creating URL to fetch data...')
            url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)
            print('URL created: ' + url)
            # NOTE(review): verify=False disables TLS certificate validation;
            # kept because the file deliberately silences the warning for it.
            scraped_data = requests.get(url, verify=False, timeout=30)
            soup_data = bs(scraped_data.text, 'lxml')
            extracted_data = soup_data.find('div', {'id': 'pagi_content'})
            # find() returns None when the div is absent, so check for that
            # explicitly before asking for the number of children.
            if extracted_data is None or len(extracted_data) == 0:
                break
            # Only the odd-indexed children are read, as in the original loop.
            for bid_card in extracted_data.contents[1::2]:
                bid_data = bid_card.text.strip().split('\n')
                writer.writerow(_parse_bid(bid_data))


if __name__ == '__main__':
    scrap_bid_data()
,
我认为您应该首先让函数在末尾返回包含数据的 extracted_data 对象。
# Collect the date, description and amount texts from every table cell that
# carries one of the two row classes. `soup` and `price_str` come from
# elsewhere in the script.
containers = soup.find_all('td', class_=['TableRecords_EvenLine', 'TableRecords_OddLine'])
dateli, descli, amtli = [], [], []
for container in containers:
    # Each selector matches divs whose id contains the given fragment.
    dateli.extend(
        node.get_text() for node in container.select('div[id*=wtDataMov]')
    )
    descli.extend(
        node.get_text() for node in container.select('div[id*=wtDescricao]')
    )
    # Amount text is passed through price_str() before conversion to float.
    amtli.extend(
        float(price_str(node.get_text()))
        for node in container.select('div[id*=wtValorEur]')
    )
然后用它创建一个数据框
page_no = 1


def scrap_bid_data(page):
    """Fetch one bid list page and return its ``pagi_content`` div.

    Args:
        page: Page number appended to the bid list URL.

    Returns:
        The element found for ``div#pagi_content``, or ``None`` when the
        page has no such div.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the request times out).
    """
    print('Hold on creating URL to fetch data...')
    url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page)
    print('URL cerated: ' + url)
    # NOTE(review): verify=False disables TLS certificate validation.
    # The timeout keeps a stalled connection from hanging the caller.
    scraped_data = requests.get(url, verify=False, timeout=30)
    soup_data = bs(scraped_data.text, 'lxml')  # parse the response with lxml
    return soup_data.find('div', {'id': 'pagi_content'})
然后导出这个数据框(DataFrame)。
import pandas as pd

# Fetch the first page's content and wrap it in a DataFrame.
# NOTE(review): scrap_bid_data() returns a parsed HTML element (or None), not
# tabular rows -- confirm the resulting frame has the intended columns.
extract_data = scrap_bid_data(page_no)
df = pd.DataFrame(data=extract_data)