问题描述
import numpy as np
from numpy.lib.stride_tricks import as_strided
from typing import Sequence,Optional
def dummy_broadcast_strides(
    shape: Sequence[int],
    to: Sequence[int],
    dtype: object,
    strides: Optional[Sequence[int]] = None,
) -> np.ndarray:
    """Return the byte strides of a ``shape`` array after broadcasting to ``to``.

    Nothing of size ``shape`` is ever allocated. A one-byte boolean is viewed
    through ``as_strided`` as if it had the requested shape, the view is
    broadcast, and the resulting one-byte strides are scaled by the item size
    of ``dtype``.

    Args:
        shape: Shape of the (hypothetical) source array.
        to: Shape to broadcast to; must be broadcast-compatible with ``shape``.
        dtype: Element type of the real array. A ``np.dtype``, a NumPy scalar
            instance (``np.float32()``), a scalar class (``np.float32``) or
            anything else ``np.dtype`` accepts (``"float64"``).
        strides: Optional strides of the source array, expressed for a
            one-byte item (i.e. in elements). ``None`` means C-contiguous.

    Returns:
        A 1-D integer array with one byte stride per axis of ``to``.

    Raises:
        ValueError: If ``shape`` cannot be broadcast to ``to``.
        TypeError: If ``dtype`` cannot be interpreted as a data type.
    """
    # dtype objects and scalar instances expose ``itemsize`` as a plain int.
    # On a scalar *class* the attribute is an unbound descriptor, and strings
    # have no such attribute, so those go through ``np.dtype`` instead.
    itemsize = getattr(dtype, "itemsize", None)
    if not isinstance(itemsize, int):
        itemsize = np.dtype(dtype).itemsize

    # A bool occupies one byte, so the dummy's strides are in units of one
    # element. The view is read-only and its memory is never dereferenced.
    dummy = as_strided(True, shape=shape, strides=strides, writeable=False)

    # Scale the unit strides to the real element size.
    return np.array(np.broadcast_to(dummy, to).strides) * itemsize
if __name__ == '__main__':
    # Small example. The target needs at least as many axes as `shape`, and
    # each axis of `shape` must either equal the target axis or be 1.
    shape = (1, 5, 1, 5)  # shape of 'a' in your example
    # A scalar instance is passed because it exposes an integer `.itemsize`.
    print(dummy_broadcast_strides(shape, (2, 5, 5, 5, 5), np.float32()))
    # [ 0  0 20  0  4]

    # Try a really "big" array: 1024**6 float32 elements would never fit in
    # memory, but only a one-byte dummy is ever allocated.
    shape = (1024,) * 6
    print(dummy_broadcast_strides(shape, (2,) + shape, np.float32()))
    # [0 4503599627370496 4398046511104 4294967296 4194304 4096 4]
DataFrame 的输出如下:
import pandas as pd
import re
from collections import defaultdict
# Accumulator from the question: a missing key starts out as an empty list.
d = defaultdict(list)
# Read the CSV referenced in the question directly from its raw GitHub URL.
csv_url = 'https://raw.githubusercontent.com/twittergithub/hello/main/category_app_id_text_1_month_march_2021%20(1).csv'
df = pd.read_csv(csv_url)
我想遍历每一行,为该行类别列表中的每个类别建立一个字典条目,其值是由该行建议列表构成的建议字典;如果某个类别下的建议重复出现,则只需增加内层字典中对应的计数器。
suggestions category
0 ['jio tv','jio','jiosaavn','jiomart','jio ... ['BOOKS_AND_REFERENCE','PRODUCTIVITY','MUSIC...
1 ['instagram','internet','instacart','instag... ['SOCIAL','COMMUNICATION','FOOD_AND_DRINK',...
2 ['instagram','instagram download... ['SOCIAL','VIDEO_PLAYERS',...
3 ['vpn','vpn free','vpn master','vpn private... ['TOOLS','TOOLS',...
4 ['pubg','pubg mobile lite','pubg lite','pub... ['GAME_ACTION','GAME_ACTION','GAME_...
... ...
...
49610 ['inbuilt camera app','inbuilt screen recorde... ['PHOTOGRAPHY','PRO...
49611 ['mpsc science app in marathi','mpsc science ... ['EDUCATION','EDUCATION','EDUCA...
49612 ['ryerson','ryerson university','ryerson mob... ['BOOKS_AND_REFERENCE','EDUCATIO...
49613 ['eeze','eezee english','ezee tab','deezer'... ['TRAVEL_AND_LOCAL','BUSINESS',...
49614 ['hindi love story books free download','hind... ['BOOKS_AND_REFERENCE','BOOKS_AND_REFERENCE',...
但是我在 defaultdict 中的类别列表中得到了空列表。我希望你能理解我的问题。
解决方法
使用 `pandas` 可能更容易、更快捷:
from ast import literal_eval
# Parse each stringified list and give every list element its own row. The
# original row label is kept as the index, so it can serve as the join key.
categories = df['category'].apply(literal_eval).explode()
suggestions = df['suggestions'].apply(literal_eval).explode()
# Joining on the shared row label pairs every category of a record with every
# suggestion of that same record; value_counts then tallies each pair.
pairs = pd.merge(categories, suggestions, left_index=True, right_index=True)
z = pairs.value_counts()
# Convert to a nested dict: {category: {suggestion: count}}.
d = {category: z.xs(category).to_dict() for category in z.index.levels[0]}
d
输出:
{'ART_AND_DESIGN': {'flipaclip': 39,'mehndi design': 28,'ibis paint x': 22,'u launcher lite': 21,'poster maker': 20,'poster maker design app free': 20,'ibis paint': 18,'mehndi design 2021': 18,'mehandi ka design': 18,'u launcher': 18,...
话虽如此,如果您想沿用原来的方法,只需将 `dictionary` 声明为 `defaultdict(dict)` 而不是 `defaultdict(list)`,因为您嵌套的是字典,而不是列表:
# Nested accumulator: a missing outer key starts out as an empty dict.
dictionary = defaultdict(dict)
# Visit the rows of df by position.
for i in range(df.shape[0]):
# The loop body is elided in the original answer.
...