ValueError:无法在 Python 中同时指定 mapper_raw 和 mapper

问题描述

我正在尝试在 Python 中使用 mrjob 读取 fna 文件

这是我的 load_read.py 程序,所有代码都可以在不使用 mrjob 的情况下正常工作。

from mrjob.job import MRJob
from Bio import SeqIO
from Bio.Seq import Seq
import re
from operator import itemgetter 
import sys

def format_read(read):
    """Return the sequence of *read* together with a label from its description.

    The description is tokenised on runs of the characters ``|``, ``=``,
    ``{`` and ``,``; the fourth token (index 3) is used as the label.

    Returns:
        A ``(read.seq, label)`` tuple.

    Raises:
        IndexError: if the description yields fewer than four tokens.
    """
    tokens = re.split('[|={,]+', read.description)
    sequence = read.seq
    return sequence, tokens[3]

# Job as posted in the question. It defines BOTH mapper_raw() and mapper();
# running it is reported to fail with
# "ValueError: Can't specify both mapper_raw and mapper".
class LoadMetaRead(MRJob):

    def mapper_raw(self,file_path,file_uri):
        """Parse the whole input file and yield one record per read (or read pair)."""
        from Bio import SeqIO
        from Bio.Seq import Seq

        # All records are materialised in a list so they can be indexed below.
        # NOTE(review): the format is passed as type='fna' here — confirm this
        # matches the SeqIO.parse signature and its supported format names.
        seqs = list(SeqIO.parse(file_path,type='fna'))

        # Treat the input as paired-end when there are more than two records
        # and the ids of the first two records end in different characters.
        is_paired_end = False
        if len(seqs) > 2 and seqs[0].id[-1:] != seqs[1].id[-1:]:
            is_paired_end = True

        # Maps each distinct label to an integer index, assigned in order of
        # first appearance.
        label_list = dict()
        label_index = 0
        
        # Step by two in paired-end mode so each iteration consumes one pair.
        for i in range(0,len(seqs),2 if is_paired_end else 1):
            read,label = format_read(seqs[i])
            if is_paired_end:
                # Append the mate's sequence; the mate's label is ignored.
                read2,_ = format_read(seqs[i + 1])
                read += read2
            
            if label not in label_list:
                label_list[label] = label_index
                label_index += 1

            # Yields three strings: record index, sequence, label index.
            # NOTE(review): this is a 3-tuple rather than a (key, value)
            # pair — confirm what mrjob expects from a mapper.
            yield str(i),str(read),str(label_list[label])            

    def mapper(self,_,line):
        """Emit every input line under the constant key 'read'."""
        yield 'read',line

    def reducer(self,key,values):
        """Yield the key together with the values iterable, unchanged."""
        yield key,values

    # The reducer function is reused as the combiner.
    combiner = reducer

# Script entry point: run the job only when this file is executed directly.
if __name__ == '__main__':
    LoadMetaRead.run()

数据文件R4.fna示例:

>r1.1 |SOURCES={GI=15668172,fw,1146130-1146958}|ERRORS={52_1:A,78_1:G,78_2:G,78_3:G,641_1:G}|SOURCE_1="Methanocaldococcus jannaschii DSM 2661 chromosome" (392b1054a4bf536ea1cc349545ace50120973c3a)
AAACCCTCTTCCACGAACCCTCTTGAAAATCCCCCACATCCACAAAATAAATCAAATAAATTTCA
ACATTATCACCAAAAGGGTAAAAGGTTATTTAAAAAATAAAATAAATTTAAAAATTTAAATTAAA
TACCAAAAAAGCCAAATAACTTATTGTGATTCTTGAGCTTTCTTTAACTCTGCCTTCATATCTTG
ATAGACTTTAGTCCATTTTAATTTTCTTGGATTTCTTCCCATTCTGTAGCTTTTCTCACATTTGG
ATGAGCAGAAATATAATACAGTCCCATCTTTTTCTACGACCATTTTTCCTTTTCCTGGCTCAATT
TCATAACCACAAAAGCTGCATGTTCTCCATTCTGGCATAGCTATCCCCCTTTAATAGTGTTTCAG
TGATTTTAAAATAATTTAAGATTAAATTATTTATCTTCTTCTGTCTAATGGTCTTGCTTCTCTCT
CTGTTTCTCTTAACATAATAATGTCTCCAACTTTAACTGGACCTTTAACGTTTCTAACTAAAACT
CTTCCAGTATCTTTTCCACCTAAGATTTTACATCTAACTTGTATAATTCCTCCAGTAACCCCTGT
TCTACCAATGACTTCAATAACTTCAGCAGCTACTGCTTCCTTATAAACAAATTCATCTTCCGATC
CTCATCACCTAATATTAATGAAGGTTTAAAATTTATAAAAAAGTTAGTAGTAGTGTTTCATAATT
TATATAATAATAACTATATACTATTGATTGATGGTTAAATAGCGTTCTAATAATTTACTGCTTCA
AAACATTTACCTTTTCAATTAATACCTTTAACTCTTCAGCATCTCCTTCGTTG
>r2.1 |SOURCES={GI=15668172,bw,239211-239971}|ERRORS={113:-,217_1:C,281_1:G,627_1:G,717_1:T}|SOURCE_1="Methanocaldococcus jannaschii DSM 2661 chromosome" (392b1054a4bf536ea1cc349545ace50120973c3a)
TAGCATGTAAATCCCTTATTTCTTAATTTCTCCCAGAATTATTTCTATTGCTTTATCAACTGCCT
TGGCAACCTCTTCAGACAACCCTGGTTTTATGTCTGGCATTGTAAATTTTTACCTTGACAACCAA
TAACCACGACTTCTATGCCTTTATTATGTAAATCTTTGAGAAATGGGGCTAATGGAACGTTATGG
GCATCGAAAGAATATTTTTTAACTATTCGGTAATTCATCAACATCTATCTTTTTTATTGTTCCAG
GTTCTAAATCAAAATCAATGGCGATCAACAACAATAATCTTTTTTATATCTTCATCAACCAACGT
CATTAAATAGTATGCTCCACTTGCCCCAGCATCTATAACTTCAACGTTATCTGGCAAGTTCATTT
TTTCTAATTTGCTAACAACCTCACATCCAAAGCCATCATCTCCAAACAACAGATTTCCACAACCA
ACAATTAATATATCCTTCTTTTTCATTTTATCACTTATTTAGCATTTCTTTATATTTTTTAGCCT
CTTCTTTAGGATTTTGTGATTGATAGATTGCCCTTCCAACAATGACGTAATCATTCTCATCTAAA
ATATTTAAAATATCCTCAATCTTCCCTCCCTGAGCTCCGACTCCGTGGTGTTATTACTGGCAATT
CTGCAATTTCTTTAATTTCTTTAAGCCTTTCAGGCCTTGTTGATGGAGCAACTATAGCATCAACT
TTTAGTTTTTTTAGCCATCTCTGACAATTTATCTGCTATTGGCTGTAG

当我用这个命令运行程序时:

python load_read.py R4.fna

它引发了这个错误

ValueError: Can't specify both mapper_raw and mapper

你知道如何解决这个问题吗?

解决方法

所以我发现不能同时定义 mapper_raw() 和 mapper(),只需要定义其中之一。我使用 mapper_raw() 是因为我需要读取整个文件,而不是逐行读取。

class LoadMetaRead(MRJob):
    """Job that reads a whole FASTA file at once and emits one record per read.

    Only ``mapper_raw`` is defined (no ``mapper``), so the file is handed to
    the job as a path instead of line by line.
    """

    def mapper_raw(self,file_path,file_uri):
        """Parse the file at *file_path* and yield ``(None, (sequence, label_id))``.

        In paired-end mode two consecutive records are concatenated into a
        single sequence. ``label_id`` is the stringified index of the read's
        label, numbered in order of first appearance.
        """
        from Bio import SeqIO
        from Bio.Seq import Seq

        records = list(SeqIO.parse(file_path,'fasta'))

        # Paired-end heuristic: more than two records, and the ids of the
        # first two records end in different characters.
        is_paired_end = (
            len(records) > 2
            and records[0].id[-1:] != records[1].id[-1:]
        )
        step = 2 if is_paired_end else 1

        # label -> index; a new label receives the current dict size, which
        # numbers labels 0, 1, 2, ... in order of first appearance.
        label_ids = {}

        for start in range(0, len(records), step):
            sequence, label = format_read(records[start])
            if is_paired_end:
                # Only the mate's sequence is used; its label is discarded.
                mate_sequence, _ = format_read(records[start + 1])
                sequence += mate_sequence

            label_id = label_ids.setdefault(label, len(label_ids))

            yield None, (str(sequence), str(label_id))

    def reducer(self,key,values):
        """Emit every value under its key, rendered with ``str``."""
        for text in map(str, values):
            yield key, text

此代码按预期工作。