以ON时间排序可变长度的字符串

问题描述

我正在尝试一种算法,以O(N)时间对字符串列表进行排序,我认为我的方法有点类似于Radix Sort。但是,我没有尝试获取排序列表,而是试图像这样获取“排序索引”:

输入:[“ AT#”,“ AC#”,“ A#”]

输出:[2,1,0]

但是,我的算法输出[1,1,0]好像它认为“ AT#”和“ AC#”相同

这是我尝试过的代码

def two_key_count_sort(L,longest_word,sigma = 6):
    '''
    Sorting a two key value list L based on counting sort
    :param L: list containing triplets (p,s,v) (primary key,secondary key and value)
    :param longest_word: length of the longest word in the sorting
    :param sigma : Alphabet Size,default is 6 indicating #,A,C,N,G,T
    :return: a sorted list L'
    '''

    # Generate Count^p and B^p
    count_p = [0 for i in range(longest_word)]
    projection_dic = {"#":0,"A":1,"C":2,"N":3,"G":4,"T":5}

    primary_string = True
    for i in L:
        primary_key = i[0]
        if isinstance(primary_key,str):
            count_p[projection_dic[primary_key]] += 1
        else:
            count_p[primary_key] += 1
            primary_string = False

    b_p = [0 for i in range(longest_word)]
    for i in range(1,len(count_p)):
        b_p[i] = b_p[i - 1] + count_p[i - 1]

    # Sorting L by secondary key
        # Generate Count^s and B^s
    count_s = [0 for i in range(sigma)]

    seconday_string = True
    for i in L:
        secondary_key = i[1]
        if isinstance(secondary_key,str):
            count_s[projection_dic[secondary_key]] += 1
        else:
            count_s[secondary_key] += 1
            seconday_string = False

    b_s = [0 for i in range(sigma)]
    for i in range(1,len(count_s)):
        b_s[i] = b_s[i - 1] + count_s[i - 1]

        #Calculate L_s -> L sorted by secondary key

    L_s = [(None,None,None) for i in range(len(L))]
    for i in range(len(L)):
        if seconday_string:
            L_s[b_s[projection_dic[L[i][1]]]] = L[i]
            b_s[projection_dic[L[i][1]]] += 1
        else:
            L_s[b_s[L[i][1]]] = L[i]
            b_s[L[i][1]] += 1

    # Compute L' Using L_s and B^p
    L_prime = [(None,None) for i in range(len(L_s))]
    for k in range(len(L_s)):
        if not primary_string:
            L_prime[b_p[L_s[k][0]]] = L_s[k]
            b_p[L_s[k][0]] += 1
        else:
            L_prime[b_p[projection_dic[L_s[k][0]]]] = L_s[k]
            b_p[projection_dic[L_s[k][0]]] += 1

    return L_prime

def sort_variable_length_string(S,sigma = 6):
    '''
    Sorting variable length strings in L in linear time
    :param S: list of strings to be sroted
    :param sigma: alphabet size of L,T
    :return: sorted strings
    '''

    # get the largest length in string L
    longest_word = max([len(i) for i in S])
    n = len(S)
    N = sum([len(i) for i in S])

    # Initialization (p = 1)
    # Generate a block pointer array B[1 .. n]
    count = [0 for i in range(sigma)]
    projection_dic = {"#": 0,"A": 1,"C": 2,"N": 3,"G": 4,"T": 5}

    for i in range(n):
        count[projection_dic[S[i][0]]] += 1

    B = [0 for i in range(n)]
    for i in range(n):
        index = projection_dic[S[i][0]]
        for j in range(index):
            B[i] += count[j]

    # Generate list L and sort it in linear time
    L = [(None,None) for i in range(N)]
    L_index = 0
    for i in range(len(S)):
        for j in range(len(S[i])):
            L[L_index] = (j,S[i][j],i)
            L_index += 1

    L_prime = two_key_count_sort(L,longest_word)

    L_i = [[] for i in range(longest_word)]
    L_i_index = 0
    for i in L_prime:
        if i[0] == L_i_index:
            L_i[L_i_index].append(i)
        else:
            L_i_index += 1
            L_i[L_i_index].append(i)

    # Update/Refinement (p > 1)
        # Initialize Q in O(|L^p|) time using two scans over L^p
    for p in range(1,longest_word):
        T_k = [0 for i in range(len(L_i[p]))]
        for k in range(len(L_i[p])):
            T_k[k] = B[L_i[p][k][2]]
        # Initialize Q and C
        Q = [None for i in range(n)]
        C = [None for i in range(n)]
        for i in T_k:
            Q[i] = -1
        for k in range(len(L_i[p])):
            if Q[T_k[k]] == -1:
                Q[T_k[k]] = T_k[k]
                C[T_k[k]] = L_i[p][k][1]

        # Update
        S = [0 for i in range(n)]
        for k in range(len(L_i[p])):
            if C[T_k[k]] == L_i[p][k][1]:
                B[L_i[p][k][2]] = Q[T_k[k]]
                S[T_k[k]] += 1
            else:
                Q[T_k[k]] += S[T_k[k]]
                S[T_k[k]] = 0
                C[T_k[k]] = L_i[p][k][1]
                B[L_i[p][k][2]] = Q[T_k[k]]

    return B

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)