问题描述
我不明白为什么链接本身可以正常访问,却仍然出现这个错误。在浏览器中点击该链接可以看到原始数据,但运行下面的代码时却返回 404 错误。
import pandas as pd
import os
import tarfile
from six.moves import urllib
# Base address of the dataset on GitHub (note: it has no trailing slash).
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/isaiahxcruz/Datasets/master/TSLA_data"
# Default local directory used by fetch_tesla_data, relative to the working directory.
TESLA_PATH = os.path.join("datasets","tesla")
# Plain string concatenation, so the value requested is
# ".../Datasets/master/TSLA_datadatasets/tesla/tesla.tgz".
TESLA_URL = DOWNLOAD_ROOT + "datasets/tesla/tesla.tgz"
def fetch_tesla_data(tesla_url=TESLA_URL, tesla_path=TESLA_PATH):
    """Download a tar archive and unpack it into ``tesla_path``.

    Args:
        tesla_url: URL of the archive to download.
        tesla_path: Local directory that receives ``tesla.tgz`` and the
            extracted members; it is created when it does not exist yet.

    Raises:
        urllib.error.HTTPError: If the server rejects the request (e.g. 404).
        tarfile.ReadError: If the downloaded file is not a tar archive.
    """
    if not os.path.isdir(tesla_path):
        os.makedirs(tesla_path)
    tgz_path = os.path.join(tesla_path, "tesla.tgz")
    urllib.request.urlretrieve(tesla_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as tesla_tgz:
        # NOTE: extractall() trusts the member paths stored in the archive;
        # only use it on downloads from a source you trust.
        tesla_tgz.extractall(path=tesla_path)
# Run the download with the default URL and target directory.
fetch_tesla_data()
解决方法
您使用的 URL 不正确。代码中 TESLA_URL 的实际取值为 https://raw.githubusercontent.com/isaiahxcruz/Datasets/master/TSLA_datadatasets/tesla/tesla.tgz
而真实的网址是 https://raw.githubusercontent.com/isaiahxcruz/Datasets/master/TSLA_data
也就是说,您在 URL 末尾多拼接了一段路径(datasets/tesla/tesla.tgz),导致它与真实网址不一致,因此返回 404。
只需将您的代码修改为以下内容
import pandas as pd
import os
import tarfile
from six.moves import urllib
# Default local directory for the download, relative to the working directory.
TESLA_PATH = os.path.join("datasets", "tesla")
# This is what changed: DOWNLOAD_ROOT is gone and TESLA_URL now holds the
# complete link to the raw file, with nothing appended to it.
TESLA_URL = "https://raw.githubusercontent.com/isaiahxcruz/Datasets/master/TSLA_data"


def fetch_tesla_data(tesla_url=TESLA_URL, tesla_path=TESLA_PATH):
    """Download the Tesla dataset into ``tesla_path``.

    The download is first written to ``tesla.tgz`` inside ``tesla_path``.
    When that file is a tar archive it is unpacked in place. Otherwise
    (for example a raw CSV served as plain text) it is kept as
    ``tesla.csv`` instead of being handed to ``tarfile``, which would
    raise ``tarfile.ReadError``.

    Args:
        tesla_url: URL of the file to download.
        tesla_path: Local directory that receives the data; it is created
            when it does not exist yet.

    Raises:
        urllib.error.HTTPError: If the server rejects the request (e.g. 404).
    """
    if not os.path.isdir(tesla_path):
        os.makedirs(tesla_path)
    tgz_path = os.path.join(tesla_path, "tesla.tgz")
    urllib.request.urlretrieve(tesla_url, tgz_path)

    if tarfile.is_tarfile(tgz_path):
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(tgz_path) as tesla_tgz:
            # NOTE: extractall() trusts the member paths stored in the
            # archive; only use it on downloads from a source you trust.
            tesla_tgz.extractall(path=tesla_path)
    else:
        # Not an archive: give the plain download an honest file name.
        os.replace(tgz_path, os.path.join(tesla_path, "tesla.csv"))
# Run the download with the default URL and target directory.
fetch_tesla_data()
,
有两个问题:
- 网址错误:只需要 `DOWNLOAD_ROOT` 本身,不需要再拼接 `"datasets/tesla/tesla.tgz"`。
- 该网址返回的是原始文本(不是压缩文件),因此不需要 `tarfile` 和 `extractall`。
import os

import pandas as pd  # needed for pd.read_csv below
from six.moves import urllib

# Link to the raw CSV text; nothing is appended to it.
URL = "https://raw.githubusercontent.com/isaiahxcruz/Datasets/master/TSLA_data"
LOCAL_FOLDER = "datasets"  # or "datasets/tesla"
LOCAL_FILENAME = "tesla.csv"
LOCAL_PATH = os.path.join(LOCAL_FOLDER, LOCAL_FILENAME)

# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(LOCAL_FOLDER, exist_ok=True)
# The server returns plain text, so the download is saved as-is (no tarfile step).
urllib.request.urlretrieve(URL, LOCAL_PATH)

df = pd.read_csv(LOCAL_PATH)
print(df)
仅此而已。