SynthText in the Wild 数据集中的 SynthText 一共包含多少种字符？

问题描述

我从 SynthText in the Wild Dataset 的官方网站下载了 SynthText。

然后，我阅读了官方的readme.txt，但是我找不到数据集有多少个字符。我用谷歌搜索，但找不到它...

如下面的示例图像所示，存在一些符号，例如 `.`、`:` 和 `-`。因此，此数据集包含字母（26 个，若区分大小写则为 52 个）+ 数字（10 个）+ 一些符号（数量未知）。

有人知道吗？

解决方法

我实现了自己的代码，可以对符号进行计数。

def get_characters(basedir, imagedirname='SynthText', skip_missing=False):
    """
    Collect every non-alphanumeric character used in the SynthText annotations.

    Walks all annotations in ``gt.mat`` through ``_gtmatRecognizer``, gathers
    the distinct characters that are not ``A-Z``, ``a-z`` or ``0-9``, shows a
    running progress line on stdout and finally prints the symbols together
    with their count.

    :param basedir: str, directory that contains the image directory
    :param imagedirname: (Optional) str, image directory name including 'gt.mat'
    :param skip_missing: (Optional) bool, if True an annotation whose image file
                         does not exist is logged and skipped; if False it
                         raises FileNotFoundError
    :return: None
    """

    class Symbols:
        """Set of distinct symbol characters that can be printed and counted."""

        def __init__(self):
            self.symbols = set()

        def update(self, data):
            # data is an iterable of single characters (a str works as-is)
            self.symbols.update(data)

        def __len__(self):
            return len(self.symbols)

        def __str__(self):
            # sorted: set iteration order is arbitrary, which made the printed
            # result differ from run to run
            return ''.join(sorted(self.symbols))

    symbols = Symbols()

    # compiled once; it is applied to every word of every image
    alnum_run = re.compile(r'[A-Za-z0-9]+')

    def csvgenerator(annodir, imagedir, cbb, wBB, imname, txts, symbols, **kwargs):
        """Per-image callback: validate the annotation and record its symbols."""
        image_num = kwargs.get('image_num')
        i = kwargs.get('i')

        imgpath = os.path.join(imagedir, imname)

        # Check for the file before doing anything else so that skip_missing is
        # honoured. The image content itself is not needed to count symbols,
        # so it is deliberately not decoded.
        if not os.path.exists(imgpath):
            if not skip_missing:
                raise FileNotFoundError('{} was not found'.format(imgpath))
            logging.warning('Missing image: {}'.format(imgpath))
            raise _Skip()

        # Convert txts to a list with one str per word.
        # A raw entry may hold several words separated by newlines and padded
        # with blanks, e.g.
        # ['Lines:\nI lost\nKevin ', 'will                ', 'line\nand            ', ...]
        # so the raw length of txts differs from the number of word boxes.
        txts = ' '.join(txts.tolist()).split()
        text_num = len(txts)

        if wBB.ndim == 2:
            # a single word box comes as shape (2, 4); make it (2, 4, 1)
            wBB = np.expand_dims(wBB, 2)

        assert text_num == wBB.shape[2], 'The length of text and wordBB must be same,but got {} and {}'.format(
            text_num, wBB.shape[2])

        # one character box is expected per character of the whitespace-free text
        char_num = sum(len(text) for text in txts)
        assert char_num == cbb.shape[2], 'The length of characters and cbb must be same,but got {} and {}'.format(
            char_num, cbb.shape[2])

        for text in txts:
            # what is left after removing alphanumeric runs are the symbols
            symbols.update(alnum_run.sub('', text))

        sys.stdout.write('\r{},and number is {}...{:0.1f}% ({}/{})'.format(
            symbols, len(symbols), 100 * (float(i + 1) / image_num), i + 1, image_num))
        sys.stdout.flush()

    _gtmatRecognizer(csvgenerator, basedir, imagedirname, customLog=True, symbols=symbols)

    print()
    print('symbols are {},and number is {}'.format(symbols, len(symbols)))


def _gtmatRecognizer(generator, basedir, imagedirname='SynthText', customLog=False, **kwargs):
    """
    Load SynthText's gt.mat and call ``generator`` once per annotated image.

    The function itself writes no annotation file; it only creates the
    'Annotations' directory under ``basedir`` and passes its path to
    ``generator``. A generator may, for example, convert each entry to the
    format of
    https://github.com/MhLiao/TextBoxes_plusplus/blob/master/data/example.xml

        <annotation>
            <folder>train_images</folder>
            <filename>img_10.jpg</filename>
            <size>
                <width>1280</width>
                <height>720</height>
                <depth>3</depth>
            </size>
            <object>
                <difficult>1</difficult>
                <content>###</content>
                <name>text</name>
                <bndbox>
                    <x1>1011</x1>
                    <y1>157</y1>
                    <x2>1079</x2>
                    <y2>160</y2>
                    <x3>1076</x3>
                    <y3>173</y3>
                    <x4>1011</x4>
                    <y4>170</y4>
                    <xmin>1011</xmin>
                    <ymin>157</ymin>
                    <xmax>1079</xmax>
                    <ymax>173</ymax>
                </bndbox>
            </object>
            .
            .
            .

        </annotation>

    :param generator: callable invoked as
                      generator(annodir, imagedir, cbb, wBB, imname, txts,
                                i=<image index>, image_num=<image count>, **kwargs).
                      It may raise _Skip to skip the current image.
    :param basedir: str, directory path under which 'SynthText' (, 'licence.txt') exist
    :param imagedirname: (Optional) str, image directory name including 'gt.mat'
    :param customLog: (Optional) bool, if True the built-in progress line is
                      not written because the generator reports progress itself
    :param kwargs: extra keyword arguments forwarded unchanged to generator
    :return: None
    :raises FileNotFoundError: if gt.mat does not exist in the image directory
    """
    logging.basicConfig(level=logging.INFO)

    imagedir = os.path.join(basedir, imagedirname)
    gtpath = os.path.join(imagedir, 'gt.mat')

    annodir = os.path.join(basedir, 'Annotations')

    if not os.path.exists(gtpath):
        raise FileNotFoundError('{} was not found'.format(gtpath))

    # create Annotations directory (no error if it is already there)
    os.makedirs(annodir, exist_ok=True)

    # Layout of the loaded dict,
    # ref: http://www.robots.ox.ac.uk/~vgg/data/scenetext/readme.txt
    #   __header__: bytes
    #   __version__: str
    #   __globals__: list
    #   charBB: object ndarray, shape = (1, image num).
    #           Character level bounding box per image,
    #           shape = (2=(x,y), 4=(top left,...: clockwise), char num)
    #   wordBB: object ndarray, shape = (1, image num).
    #           Word level bounding box per image,
    #           shape = (2=(x,y), 4=(top left,...: clockwise), word num)
    #           NOTE: presumably (2, 4) when an image has a single word -- confirm
    #   imnames: object ndarray, shape = (1, image num). Each entry wraps the file name.
    #   txt: object ndarray, shape = (1, image num).
    #        Text strings per image.
    logging.info('Loading {} now.\nIt may take a while.'.format(gtpath))
    gts = sio.loadmat(gtpath)
    logging.info('Loaded\n')

    # every field has a leading axis of length 1; drop it to iterate per image
    charBB = gts['charBB'][0]
    wordBB = gts['wordBB'][0]
    imnames = gts['imnames'][0]
    texts = gts['txt'][0]

    image_num = imnames.size

    for i, (cbb, wBB, imname, txts) in enumerate(zip(charBB, wordBB, imnames, texts)):
        # the file name is wrapped in a one-element array
        imname = imname[0]

        try:
            generator(annodir, imagedir, cbb, wBB, imname, txts, i=i, image_num=image_num, **kwargs)
        except _Skip:
            # the generator asked to skip this image; carry on with the next one
            pass

        if not customLog:
            sys.stdout.write('\rGenerating... {:0.1f}% ({}/{})'.format(
                100 * (float(i + 1) / image_num), i + 1, image_num))
        sys.stdout.flush()

    print()
    logging.info('Finished!!!')

最后，我得到了符号的种类和数量。看来数据集中出现的符号就是除空格以外的全部 ASCII 可打印符号（ASCII printable characters），共 32 个。

INFO:root:Loading ~/data/text/SynthText/SynthText/gt.mat now.
It may take a while.
INFO:root:Loaded

}&|%_(],$^{+?#@/-`).<=;~['>:\!"*,and number is 32...100.0% (858750/858750)
INFO:root:Finished!!!

symbols are }&|%_(],$^{+?#@/-`).<=;~['>:\!"*,and number is 32

dataset text-recognition