问题描述
我正在按照这个教程(Tutorial)来清理足球数据。
当我尝试为我的数据执行这段代码时:
# Gets the goals scored agg arranged by teams and matchweek
def get_goals_scored(playing_stat, n_matchweeks=36):
    """Return cumulative goals scored per team, one column per matchweek.

    Args:
        playing_stat: DataFrame with the columns 'HomeTeam', 'AwayTeam',
            'FTHG' (full-time home goals) and 'FTAG' (full-time away goals),
            one row per match, in the order the matches were played.
        n_matchweeks: Number of matches every team plays. Defaults to 36,
            the value that was previously hard-coded.

    Returns:
        DataFrame indexed by team name. Columns 1..n_matchweeks hold the
        running total of goals scored after that many matches; column 0 is
        all zeros (the state before the first match).

    Raises:
        ValueError: If any team's number of matches differs from
            ``n_matchweeks`` (the per-team lists cannot be aligned with the
            matchweek index).
        KeyError: If a team appears in 'AwayTeam' but never in 'HomeTeam'.
    """
    # Create a dictionary with team names as keys. The names are taken from
    # the column directly instead of groupby(...).mean(), which fails on
    # non-numeric columns in recent pandas versions.
    teams = {team: [] for team in sorted(playing_stat['HomeTeam'].dropna().unique())}

    # Each team's list collects the goals it scored, match by match.
    for home, away, home_goals, away_goals in zip(
            playing_stat['HomeTeam'], playing_stat['AwayTeam'],
            playing_stat['FTHG'], playing_stat['FTAG']):
        teams[home].append(home_goals)
        teams[away].append(away_goals)

    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    matchweeks = list(range(1, n_matchweeks + 1))
    goals_scored = pd.DataFrame(data=teams, index=matchweeks).T
    goals_scored[0] = 0

    # Aggregate to get the running total up to each matchweek.
    for week in matchweeks[1:]:
        goals_scored[week] = goals_scored[week] + goals_scored[week - 1]
    return goals_scored
# Gets the goals conceded agg arranged by teams and matchweek
def get_goals_conceded(playing_stat, n_matchweeks=36):
    """Return cumulative goals conceded per team, one column per matchweek.

    Args:
        playing_stat: DataFrame with the columns 'HomeTeam', 'AwayTeam',
            'FTHG' (full-time home goals) and 'FTAG' (full-time away goals),
            one row per match, in the order the matches were played.
        n_matchweeks: Number of matches every team plays. Defaults to 36,
            the value that was previously hard-coded.

    Returns:
        DataFrame indexed by team name. Columns 1..n_matchweeks hold the
        running total of goals conceded after that many matches; column 0
        is all zeros (the state before the first match).

    Raises:
        ValueError: If any team's number of matches differs from
            ``n_matchweeks`` (the per-team lists cannot be aligned with the
            matchweek index).
        KeyError: If a team appears in 'AwayTeam' but never in 'HomeTeam'.
    """
    # Create a dictionary with team names as keys. The names are taken from
    # the column directly instead of groupby(...).mean(), which fails on
    # non-numeric columns in recent pandas versions.
    teams = {team: [] for team in sorted(playing_stat['HomeTeam'].dropna().unique())}

    # The home side concedes the away goals (FTAG) and vice versa.
    for home, away, home_goals, away_goals in zip(
            playing_stat['HomeTeam'], playing_stat['AwayTeam'],
            playing_stat['FTHG'], playing_stat['FTAG']):
        teams[home].append(away_goals)
        teams[away].append(home_goals)

    # Create a dataframe for goals conceded where rows are teams and cols are matchweek.
    matchweeks = list(range(1, n_matchweeks + 1))
    goals_conceded = pd.DataFrame(data=teams, index=matchweeks).T
    goals_conceded[0] = 0

    # Aggregate to get the running total up to each matchweek.
    for week in matchweeks[1:]:
        goals_conceded[week] = goals_conceded[week] + goals_conceded[week - 1]
    return goals_conceded
def get_gss(playing_stat):
    """Add cumulative goals scored/conceded columns to a season's matches.

    For every match the running totals *before* the current matchweek are
    looked up for both sides and stored in four new columns:
    'HTGS'/'ATGS' (home/away team goals scored) and 'HTGC'/'ATGC'
    (home/away team goals conceded).

    Args:
        playing_stat: DataFrame with 'HomeTeam', 'AwayTeam', 'FTHG' and
            'FTAG' columns, one row per match.

    Returns:
        The same ``playing_stat`` object, modified in place with the four
        new columns.
    """
    GC = get_goals_conceded(playing_stat)
    GS = get_goals_scored(playing_stat)

    week = 0
    HTGS = []
    ATGS = []
    HTGC = []
    ATGC = []

    # Walk over the actual rows rather than a fixed row count, so the
    # function works for a season of any length.
    fixtures = zip(playing_stat['HomeTeam'], playing_stat['AwayTeam'])
    for played, (ht, at) in enumerate(fixtures, start=1):
        HTGS.append(GS.loc[ht, week])
        ATGS.append(GS.loc[at, week])
        HTGC.append(GC.loc[ht, week])
        ATGC.append(GC.loc[at, week])

        # Move on to the next matchweek column after every 10 matches.
        # NOTE(review): this assumes rows are ordered by matchweek with 10
        # fixtures per week -- confirm for leagues of a different size.
        if played % 10 == 0:
            week = week + 1

    playing_stat['HTGS'] = HTGS
    playing_stat['ATGS'] = ATGS
    playing_stat['HTGC'] = HTGC
    playing_stat['ATGC'] = ATGC
    return playing_stat
# Apply to each dataset: every season frame is replaced by the result of
# get_gss, which adds the HTGS/ATGS/HTGC/ATGC columns.
(playing_statistics_1,
 playing_statistics_2,
 playing_statistics_3,
 playing_statistics_4) = (
    get_gss(season_stats)
    for season_stats in (playing_statistics_1,
                         playing_statistics_2,
                         playing_statistics_3,
                         playing_statistics_4)
)
我收到此错误:ValueError: Could not broadcast input array from shape (34) into shape (36)
我的数据有 313 条记录,如下所示:
我知道这与数据的长度有关,但我无法弄清楚它到底是什么。
非常感谢任何帮助,如果需要任何额外信息,请告诉我!
解决方法
虽然这个教程里有很多地方需要修正(这里不展开讨论),但我运行时完全没有遇到那个错误。会不会是您下载文件时,其中某个文件损坏了(我得到的是 380 行/记录,而不是 313)?请尝试把最开始在本地读取文件的那段代码改成下面的代码,直接从源地址读取文件,然后告诉我结果如何:
# Base URL of the season CSV files in the tutorial's GitHub repository.
loc = "https://raw.githubusercontent.com/RudrakshTuwani/Football-Data-Analysis-and-Prediction/master/Datasets/"

# Read each season's file directly from the source, in chronological order,
# binding the resulting frames to raw_data_1 .. raw_data_16.
(raw_data_1, raw_data_2, raw_data_3, raw_data_4,
 raw_data_5, raw_data_6, raw_data_7, raw_data_8,
 raw_data_9, raw_data_10, raw_data_11, raw_data_12,
 raw_data_13, raw_data_14, raw_data_15, raw_data_16) = (
    pd.read_csv(loc + csv_name)
    for csv_name in (
        '2000-01.csv', '2001-02.csv', '2002-03.csv', '2003-04.csv',
        '2004-05.csv', '2005-06.csv', '2006-07.csv', '2007-08.csv',
        '2008-09.csv', '2009-10.csv', '2010-11.csv', '2011-12.csv',
        '2012-13.csv', '2013-14.csv', '2014-15.csv', '2015-16.csv',
    )
)