问题描述
我正在尝试比较不同的曲线,以判断某一条曲线是否始终位于另一条曲线之上;如果不是,则找出这种变化发生在哪个 x 处。
如果各条曲线具有相同的 x 值和相同的长度,仅 y 值不同,那将非常容易。
但是不同的曲线具有不同的 x 值,向量的长度也不相同,不过所有曲线的 x 间隔都是相同的。
作为一个非常简单的示例,我使用以下数据:
# Curve 1: 9 points
x1 = np.array([5, 6, 7, 8, 9, 10, 11, 12, 13])
y1 = np.array([100, 101, 110, 130, 132, 170, 190, 192, 210])
# Curve 2: 10 points (x and y must have the same length)
x2 = np.array([3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
y2 = np.array([90, 210, 211, 250, 260, 261, 265, 180, 200, 210])
# Curve 3: 8 points, sampled 0.3 off the integer grid
x3 = np.array([7.3, 8.3, 9.3, 10.3, 11.3, 12.3, 13.3, 14.3])
y3 = np.array([300, 250, 270, 350, 380, 400, 390, 380])
它们应该是2条回归线。在这个简单的示例中,结果应该是:在整个 x 范围内,曲线2的值都高于曲线1。
我试图把 x 在 2.5–12.5 的范围内按长度为 1 的 bin 进行分箱,以比较每个 bin 中对应的 y 值。
我的实际数据很大,这种比较需要做很多次,所以我需要找到一个不需要很多时间的解决方案。
绘图
- 给定x轴的数据图
# Plot every curve against its own, original x-values.
plt.figure(figsize=(6, 6))
plt.plot(x1, y1, marker='o', label='y1')
plt.plot(x2, y2, label='y2')
plt.plot(x3, y3, label='y3')
plt.xticks(range(15))
plt.grid()
# Draw the legend once, outside the axes. The keyword is the lowercase
# ``bbox_to_anchor``; the misspelt ``bBox_to_anchor`` is not accepted.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
解决方法
函数
- def get_new_x:使用 np.digitize 对 x 轴的值重新分箱。
- def get_comparison:为每两个相邻的列添加一列布尔比较结果。
  - 当前,每个新列都被添加到主数据框 df 中,但也可以改为写入一个单独的 comparison 数据框。
  - combs 是列组合的列表:
    [Index(['y1','y2'],dtype='object'),Index(['y2','y3'],dtype='object')]
def get_bins(x_arrays: List[np.ndarray]) -> np.ndarray:
    """Build one set of bin edges that spans every x-array.

    The bin width is the spacing between the first two points of the first
    array; all curves are expected to share that spacing.

    Args:
        x_arrays: x-values of each curve. The first array needs at least two
            points so that a spacing can be derived from it.

    Returns:
        1-D array of edges starting at the overall minimum x and advancing
        by the bin width until the overall maximum x is covered.

    Raises:
        ValueError: If the first array holds fewer than two points.
    """
    first = x_arrays[0]
    if len(first) < 2:
        raise ValueError(
            "the first x-array needs at least two points to derive the bin width"
        )
    # Use a scalar step. np.diff would return a 1-element array, and handing
    # that to np.arange relies on implicit array-to-scalar conversion.
    bin_len = first[1] - first[0]
    all_x = np.concatenate(x_arrays)
    # ndarray reductions run in native code; the builtin min()/max() would
    # iterate element by element in Python.
    min_x = all_x.min()
    max_x = all_x.max()
    return np.arange(min_x, max_x + bin_len, bin_len)
def get_new_x(x_arrays: List[np.array]) -> List[np.array]:
    """Snap every x-array onto the shared bin edges from ``get_bins``.

    Each value is first rounded with ``np.round`` and then assigned, through
    ``np.digitize(..., right=True)``, to the smallest edge that is greater
    than or equal to the rounded value.

    Returns:
        One array of bin-edge values per input array, in input order.
    """
    edges = get_bins(x_arrays)
    # NOTE: rounding to whole numbers presumes the edges sit on integers.
    return [edges[np.digitize(np.round(x), edges, right=True)] for x in x_arrays]
def get_df(x_arrays: List[np.array],y_arrays: List[np.array]) -> pd.DataFrame:
    """Assemble the curves into one dataframe indexed by the rebinned x-values.

    Curve number ``i`` (1-based) becomes column ``y<i>``; its index is the
    matching rebinned x-array returned by ``get_new_x``. The per-curve frames
    are concatenated column-wise, so rows are aligned on the index.
    """
    x_new = get_new_x(x_arrays)
    frames = []
    for pos, y in enumerate(y_arrays):
        frames.append(pd.DataFrame(y, columns=[f'y{pos+1}'], index=x_new[pos]))
    return pd.concat(frames, axis=1)
def get_comparison(df: pd.DataFrame) -> None:
    """Append a boolean ``'<left> > <right>'`` column per adjacent column pair.

    Each column is compared with the column immediately to its right. The
    new column is True where the left value is greater than the right value.
    Comparisons that involve NaN evaluate to False.

    Args:
        df: Dataframe to extend. It is modified in place.

    Returns:
        None. The result columns are added to ``df`` itself.
    """
    # Keep the column index as it is now, so that the result columns added
    # below never become comparison operands themselves.
    cols = df.columns
    for left, right in zip(cols[:-1], cols[1:]):
        df[f'{left} > {right}'] = df[left] > df[right]
调用函数:
import numpy as np
import pandas as pd
# Collect the curves; position i in ``x`` pairs with position i in ``y``.
y = [y1, y2, y3]
x = [x1, x2, x3]
# Build the rebinned dataframe: one column per curve.
df = get_df(x, y)
# Append the boolean comparison columns to ``df`` in place.
get_comparison(df)
# The comparison columns follow the len(y) data columns. For each of them,
# report the x-values at which the comparison is True.
for col in df.columns[len(y):]:
    vals = df.index[df[col]].tolist()
    if vals:
        print(f'{col}: {vals}')
[out]:
y2 > y3: [8.0]
display(df)
y1 y2 y3 y1 > y2 y2 > y3
3.0 NaN 90.0 NaN False False
4.0 NaN 210.0 NaN False False
5.0 100.0 211.0 NaN False False
6.0 101.0 250.0 NaN False False
7.0 110.0 260.0 300.0 False False
8.0 130.0 261.0 250.0 False True
9.0 132.0 265.0 270.0 False False
10.0 170.0 180.0 350.0 False False
11.0 190.0 200.0 380.0 False False
12.0 192.0 210.0 400.0 False False
13.0 210.0 NaN 390.0 False False
14.0 NaN NaN 380.0 False False
绘图
fig, ax = plt.subplots(figsize=(8, 6))
# Highlight the flagged points. For the k-th comparison column (k counted
# from 1), mark the values of the data column at position k.
for pos, flag_col in enumerate(df.columns[3:], 1):
    curve = df.iloc[:, pos]
    flagged = curve[df[flag_col]]
    if not flagged.empty:
        ax.scatter(flagged.index, flagged.values, color='red', s=110, label='bad')
# Draw the first three columns (the curves) on the same axes.
df.iloc[:, :3].plot(marker='o', ax=ax)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(range(16))
plt.title('y-values plotted against rebinned x-values')
plt.grid()
plt.show()
,
这是我最初提出这个问题时心里想要的答案,只是当时没能实现。我的思路是:根据 x 对 y1 和 y2 进行分箱,然后在每个 bin 中比较两者。作为示例,这里有 3 条需要比较的曲线,它们唯一的共同点是 delta x(箱长),此处为 1。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Curve 1
x1 = np.array([5, 6, 7, 8, 9, 10, 11, 12, 13])
y1 = np.array([100, 101, 110, 130, 132, 170, 190, 192, 210])
# Curve 2 (x and y must have the same length)
x2 = np.array([3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
y2 = np.array([90, 210, 211, 250, 260, 261, 265, 180, 200, 210])
# Curve 3
x3 = np.array([7.3, 8.3, 9.3, 10.3, 11.3, 12.3, 13.3, 14.3])
y3 = np.array([300, 250, 270, 350, 380, 400, 390, 380])
bin_length = 1
# All curves share the same x spacing. Pad the overall x range by half a
# bin on each side before laying out the edges.
x_min = min(x1[0], x2[0], x3[0]) - bin_length / 2
x_max = max(x1[-1], x2[-1], x3[-1]) + bin_length / 2
bins = np.arange(x_min, x_max + bin_length, bin_length)
# Mid-point of every bin; used later as the index of the binned table.
bin_mid = [(low + high) / 2 for low, high in zip(bins[:-1], bins[1:])]
def bin_fun(x, y, bin_length, bin_edges=None):
    """Group the y-values by the bin that their x-value falls into.

    A point belongs to bin ``i`` when ``edges[i] <= x < edges[i + 1]``.
    Points outside every bin are dropped. ``x`` and ``y`` are paired
    element by element, and pairing stops at the shorter of the two.

    Args:
        x: x-values of the curve.
        y: y-values of the curve.
        bin_length: Not used by the binning itself; kept so existing calls
            keep working.
        bin_edges: Increasing bin edges. When omitted, the module-level
            ``bins`` array is used.

    Returns:
        DataFrame with one row per bin. Row ``i`` lists the y-values that
        fell into bin ``i`` in input order, padded with NaN where a bin
        holds fewer values than the fullest bin.
    """
    edges = np.asarray(bins if bin_edges is None else bin_edges)
    n_bins = max(len(edges) - 1, 0)
    members = [[] for _ in range(n_bins)]
    pairs = list(zip(x, y))
    if pairs and n_bins:
        x_vals = np.asarray([x_val for x_val, _ in pairs])
        # One digitize pass replaces a scan of all points for every bin.
        # digitize returns k for edges[k-1] <= x < edges[k], hence the -1.
        slots = np.digitize(x_vals, edges) - 1
        for slot, (_, y_val) in zip(slots, pairs):
            if 0 <= slot < n_bins:
                members[slot].append(y_val)
    return pd.DataFrame(members)
# Bin each curve's y-values; every call yields one row per bin.
# NOTE: assigning a single column label assumes at most one point per bin.
Y1 = bin_fun(x1, y1, bin_length)
Y1.columns = [1]
Y2 = bin_fun(x2, y2, bin_length)
Y2.columns = [2]
Y3 = bin_fun(x3, y3, bin_length)
Y3.columns = [3]
# Place the curves side by side. pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x.
binned_y = pd.concat([Y1, Y2, Y3], axis=1)
binned_y.index = bin_mid
print(binned_y)
# Curve 2 vs. curve 1: print each bin mid-point where curve 2 is lower.
# Bins in which either curve is NaN compare as False and are skipped.
for i in binned_y.index[binned_y[2] < binned_y[1]]:
    print(i)
# Curve 3 vs. curve 2: print each bin mid-point where curve 3 is lower.
for i in binned_y.index[binned_y[3] < binned_y[2]]:
    print(i)
这将返回 8,即 y3 低于 y2 的那个 bin 的中点。
binned_y
1 2 3
3.0 NaN 90.0 NaN
4.0 NaN 210.0 NaN
5.0 100.0 211.0 NaN
6.0 101.0 250.0 NaN
7.0 110.0 260.0 300.0
8.0 130.0 261.0 250.0
9.0 132.0 265.0 270.0
10.0 170.0 180.0 350.0
11.0 190.0 200.0 380.0
12.0 192.0 210.0 400.0
13.0 210.0 NaN 390.0
14.0 NaN NaN 380.0
15.0 NaN NaN NaN
绘图
# Plot the binned curves against the bin mid-points.
binned_y.plot(marker='o', figsize=(6, 6))
# One label per column, with the legend placed outside the axes.
plt.legend(labels=['y1', 'y2', 'y3'], bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(range(16))
plt.grid()