问题描述

我正在尝试比较不同的曲线，以了解其中一条是否始终位于另一条之上；如果不是，则想知道这种变化发生在哪个 x 处。

如果各条曲线具有相同的 x 值和相同的长度，只是 y 值不同，那将非常容易。

但是对于不同的线，我有不同的x值，向量的长度也不相同，但是所有曲线的x间隔都是相同的。

作为一个非常简单的示例，我使用以下数据：

# curve 1: len = 9
x1 = np.array([5, 6, 7, 8, 9, 10, 11, 12, 13])
y1 = np.array([100, 101, 110, 130, 132, 170, 190, 192, 210])

# curve 2: len = 10 (x and y need one entry per point)
x2 = np.array([3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
y2 = np.array([90, 210, 211, 250, 260, 261, 265, 180, 200, 210])

# curve 3: len = 8; x is shifted by 0.3 but keeps the same spacing of 1
x3 = np.array([7.3, 8.3, 9.3, 10.3, 11.3, 12.3, 13.3, 14.3])
y3 = np.array([300, 250, 270, 350, 380, 400, 390, 380])

它们应该是2条回归线。在这个简单的示例中，结果应该是在所有x范围内，曲线2具有比曲线1高的值。

我尝试在 2.5–12.5 的范围内对 x 进行分箱（箱长为 1），以比较每个箱中对应的 y。

我的实际数据很大，这种比较需要做很多次，所以我需要找到一个不需要很多时间的解决方案。

绘图

给定x轴的数据图

# plot the three raw curves against their own x-values
plt.figure(figsize=(6,6))
plt.plot(x1,y1,marker='o',label='y1')
plt.plot(x2,y2,label='y2')
plt.plot(x3,y3,label='y3')
plt.xticks(range(15))
plt.grid()
# the keyword is all lower-case: bbox_to_anchor anchors the legend box
# at (1.05, 1) in axes coordinates, i.e. just right of the plot area
plt.legend(bbox_to_anchor=(1.05,1),loc='upper left')

解决方法

功能

def get_new_x 使用 np.digitize 对 x 轴的值重新分箱。
def get_comparison 为每一对相邻的列添加一列布尔值比较结果
- 当前，每个新列都已添加到主数据框df，但是可以将其更新为单独的comparison数据框。
- combs 是由列组合构成的列表
  - [Index(['y1','y2'],dtype='object'),Index(['y2','y3'],dtype='object')]

# function to create the bins 
def get_bins(x_arrays: List[np.ndarray]) -> np.ndarray:
    """Build one common set of bin edges covering every x-array.

    The bin width is the spacing between the first two points of the
    first array; all curves are expected to share that spacing.

    Args:
        x_arrays: x-values of each curve. The first array needs at
            least two points so the spacing can be measured.

    Returns:
        1-D array of edges starting at the smallest x of all curves and
        advancing by the bin width until the largest x is covered.

    Raises:
        ValueError: if the first array has fewer than two points.
    """
    first = np.asarray(x_arrays[0])
    if first.size < 2:
        raise ValueError('the first x-array needs at least two points')
    # use a plain scalar step; np.diff would return a 1-element array,
    # which is not a proper stop/step argument for np.arange
    bin_len = first[1] - first[0]
    all_x = np.concatenate(x_arrays)  # join arrays
    # array methods instead of the builtin min/max, which iterate
    # element by element in Python
    min_x = all_x.min()
    max_x = all_x.max()
    # the extra bin_len on the stop keeps the largest x inside the range
    return np.arange(min_x, max_x + bin_len, bin_len)


# function using np.digitize to bin the old x-axis into new bins
def get_new_x(x_arrays: List[np.array]) -> List[np.array]:
    """Map every x-array onto the shared bin edges.

    Each x-value is rounded, located among the edges with
    ``np.digitize(..., right=True)`` and replaced by the edge found at
    that position. One new array is returned per input array.
    """
    edges = get_bins(x_arrays)  # common edges for all curves
    return [
        edges[np.digitize(np.round(values), edges, right=True)]
        for values in x_arrays
    ]


# function to create dataframe for arrays with new x-axis as index
def get_df(x_arrays: List[np.array],y_arrays: List[np.array]) -> pd.DataFrame:
    """Combine all curves into one dataframe indexed by the rebinned x.

    Curve number ``i`` (counting from 1) becomes column ``y{i}``; its
    index is the matching rebinned x-array. The per-curve frames are
    joined side by side, so the rows are aligned on the index.
    """
    rebinned = get_new_x(x_arrays)
    frames = []
    for pos, values in enumerate(y_arrays):
        frames.append(
            pd.DataFrame(values, columns=[f'y{pos+1}'], index=rebinned[pos])
        )
    return pd.concat(frames, axis=1)


# compare each successive column of the dataframe
# if the left column is greater than the right column,then True
def get_comparison(df: pd.DataFrame):
    """Add one boolean column per pair of neighbouring columns, in place.

    For neighbours ``a`` and ``b`` the new column is named ``'a > b'``
    and holds the element-wise result of ``df[a] > df[b]``. Nothing is
    returned; ``df`` itself is modified.
    """
    # snapshot the names first so the columns added below are not paired up
    names = df.columns
    for left, right in zip(names[:-1], names[1:]):
        df[f'{left} > {right}'] = df[left] > df[right]

调用函数：

from typing import List

import numpy as np
import pandas as pd

# collect the curves so they can be processed together
x = [x1, x2, x3]
y = [y1, y2, y3]

# build the combined dataframe on the rebinned x-axis
df = get_df(x, y)

# append the pairwise boolean comparison columns to df in place
get_comparison(df)

# for every comparison column, report the index values where it is True
for col in df.columns[3:]:
    flagged = df.index[df[col]].tolist()
    if not flagged:
        continue
    print(f'{col}: {flagged}')

[out]:
y2 > y3: [8.0]

display(df)

         y1     y2     y3  y1 > y2  y2 > y3
3.0     NaN   90.0    NaN    False    False
4.0     NaN  210.0    NaN    False    False
5.0   100.0  211.0    NaN    False    False
6.0   101.0  250.0    NaN    False    False
7.0   110.0  260.0  300.0    False    False
8.0   130.0  261.0  250.0    False     True
9.0   132.0  265.0  270.0    False    False
10.0  170.0  180.0  350.0    False    False
11.0  190.0  200.0  380.0    False    False
12.0  192.0  210.0  400.0    False    False
13.0  210.0    NaN  390.0    False    False
14.0    NaN    NaN  380.0    False    False

绘图

fig, ax = plt.subplots(figsize=(8, 6))

# mark the problem values: for each comparison column, highlight the
# points of the right-hand curve at the rows where the comparison is True
for pos, flag_col in enumerate(df.columns[3:], start=1):
    mask = df[flag_col]
    flagged = df.iloc[:, pos][mask]
    if flagged.empty:
        continue
    ax.scatter(flagged.index, flagged.values, color='red', s=110, label='bad')

# draw the three y-columns on the same axes
df.iloc[:, :3].plot(marker='o', ax=ax)

plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(range(16))
plt.title('y-values plotted against rebinned x-values')
plt.grid()
plt.show()

这是我最初提出这个问题时心里想要的答案，但当时没能实现。我的思路是根据 x 对 y1 和 y2 进行分箱，然后在每个箱中比较两者。作为示例，这里有 3 条曲线需要相互比较；这些曲线唯一的共同点是 Δx（箱长），此处为 1。

import numpy as np
import pandas as  pd
import matplotlib.pyplot as plt

# curve 1
x1 = np.array([5, 6, 7, 8, 9, 10, 11, 12, 13])
y1 = np.array([100, 101, 110, 130, 132, 170, 190, 192, 210])

# curve 2 (x and y need one entry per point)
x2 = np.array([3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
y2 = np.array([90, 210, 211, 250, 260, 261, 265, 180, 200, 210])

# curve 3: x is shifted by 0.3 but keeps the same spacing of 1
x3 = np.array([7.3, 8.3, 9.3, 10.3, 11.3, 12.3, 13.3, 14.3])
y3 = np.array([300, 250, 270, 350, 380, 400, 390, 380])

bin_length = 1
# the x spacing is the same for every curve

# widen the covered x range by half a bin on each side
x_min = min(x1[0], x2[0], x3[0]) - bin_length/2
x_max = max(x1[-1], x2[-1], x3[-1]) + bin_length/2

# bin edges; the extra bin_length on the stop keeps x_max inside the range
bins = np.arange(x_min, x_max + bin_length, bin_length)

# mid point of every bin (average of its two edges), used as index later
bin_mid = [(low + high)/2 for low, high in zip(bins[:-1], bins[1:])]

# This function bins y based on binning x
def bin_fun(x, y, bin_length, bin_edges=None):
    """Group the y-values by the bin their x-value falls into.

    Args:
        x: x-values of one curve.
        y: y-values paired with ``x`` position by position; surplus
            items of the longer input are ignored.
        bin_length: not used by the binning itself; kept so existing
            calls keep working.
        bin_edges: bin edges in ascending order. When omitted, the
            module-level ``bins`` array is used.

    Returns:
        DataFrame with one row per bin, in edge order. A row holds the
        y-values whose x satisfies ``low <= x < high`` for that bin;
        shorter rows are padded with missing values.
    """
    edges = np.asarray(bins if bin_edges is None else bin_edges)
    n_bins = max(len(edges) - 1, 0)
    members = [[] for _ in range(n_bins)]
    if n_bins:
        # side='right' returns i with edges[i-1] <= x < edges[i], so
        # i - 1 is the bin number; one vectorised lookup replaces the
        # scan over all points for every bin
        positions = np.searchsorted(edges, np.asarray(x), side='right') - 1
        for pos, value in zip(positions, y):
            # x below the first edge gives -1, x at/after the last edge
            # gives n_bins: both lie outside every bin
            if 0 <= pos < n_bins:
                members[pos].append(value)
    # build the frame once, after all bins are filled
    return pd.DataFrame(members)


# bin every curve on the shared edges; each result has one row per bin
Y1 = bin_fun(x1, y1, bin_length)
Y1.columns = [1]

Y2 = bin_fun(x2, y2, bin_length)
Y2.columns = [2]

Y3 = bin_fun(x3, y3, bin_length)
Y3.columns = [3]

# put the curves side by side (DataFrame.append no longer exists in pandas 2.x)
binned_y = pd.concat([Y1, Y2, Y3], axis=1)

# label every row with the mid point of its bin
binned_y.index = bin_mid

print(binned_y)

# comparing curve 2 and curve 1: print the bins where curve 2 is below curve 1
# (a bin with a missing value on either side compares as False)
for i in binned_y.index[binned_y[2] < binned_y[1]]:
    print(i)

# comparing curve 3 and curve 2: print the bins where curve 3 is below curve 2
for i in binned_y.index[binned_y[3] < binned_y[2]]:
    print(i)

这将返回 8，即 y3 < y2 处的索引。

`binned_y`

          1      2      3
3.0     NaN   90.0    NaN
4.0     NaN  210.0    NaN
5.0   100.0  211.0    NaN
6.0   101.0  250.0    NaN
7.0   110.0  260.0  300.0
8.0   130.0  261.0  250.0
9.0   132.0  265.0  270.0
10.0  170.0  180.0  350.0
11.0  190.0  200.0  380.0
12.0  192.0  210.0  400.0
13.0  210.0    NaN  390.0
14.0    NaN    NaN  380.0
15.0    NaN    NaN    NaN

绘图

binned_y.plot(marker='o',figsize=(6,6))  # plot the dataframe
# one label per column of binned_y; the legend box is anchored at
# (1.05, 1) in axes coordinates, i.e. just right of the plot area
plt.legend(labels=['y1','y2','y3'],bbox_to_anchor=(1.05,1),loc='upper left')
plt.xticks(range(16))
plt.grid()

compare numpy pandas python

对于给定的bin，如何确定一个数组中的任何值是否低于另一个数组中的任何值？

问题描述

情节

解决方法

功能

调用函数：

display（df）

情节

binned_y

情节

`binned_y`