问题描述
我正在 Colab 上做一些 n-gram 建模。代码中没有任何语句会把数据保存到 Colab 的磁盘上,但在我写这篇文章时,这段代码已经占用了 10 GB 以上的磁盘空间。我不明白为什么会占用额外的磁盘空间——按理说它应该只使用内存才对?
代码如下。是我太笨、漏掉了什么显而易见的东西吗?到底发生了什么?
# Mount point passed to drive.mount (presumably google.colab's `drive` helper,
# imported outside this excerpt — TODO confirm).
drive.mount('/content/drive')
# Directory that is listed and read for the pickled corpus files.
i_path = '/content/drive/My Drive/corpora/reddit_comments/preprocessed'
# Results directory; not referenced anywhere else in this excerpt.
r_path = '/content/drive/My Drive/corpora/reddit_comments/results'
def n_grams(sentence, n):
    """Return every contiguous run of ``n`` whitespace-separated tokens.

    Args:
        sentence: Text to tokenise; it is split on whitespace.
        n: Number of tokens per n-gram.

    Returns:
        A list of token lists in left-to-right order. The list is empty
        when the sentence holds fewer than ``n`` tokens.
    """
    tokens = sentence.split()
    # Last start index at which a full window of n tokens still fits.
    last_start = len(tokens) - n
    return [
        tokens[start:start + n]
        for start in range(len(tokens))
        if start <= last_start
    ]
# For a given date, return Pr(sequence).
def seq_prob(sequence, n, date, i_path):
    """Estimate the probability of ``sequence`` under an n-gram model.

    The counts come from every pickled file in ``i_path`` whose name
    contains ``date``. Each file is expected to unpickle to an iterable of
    sentence strings. The result is the product, over the n-grams of
    ``sequence``, of count(n-gram) / count(its first n-1 tokens). Both
    count tables are printed before returning.

    Args:
        sequence: Text to score; it is lower-cased and split on whitespace.
        n: Model order. Clamped to the number of tokens in ``sequence``.
        date: Substring that selects corpus files by name.
        i_path: Directory holding the pickled corpus files.

    Returns:
        The product of the conditional probabilities, or ``0`` when any
        context was never seen in the selected files.

    Raises:
        ValueError: If ``sequence`` has no tokens or ``n`` is below 1.
    """
    seq_tokens = sequence.lower().split()
    if not seq_tokens:
        raise ValueError('sequence must contain at least one token')
    # An n-gram cannot be longer than the sequence itself.
    n = min(n, len(seq_tokens))
    if n < 1:
        raise ValueError('n must be at least 1')

    seq_ngrams = [
        tuple(seq_tokens[start:start + n])
        for start in range(len(seq_tokens) - n + 1)
    ]
    # Dict keys deduplicate repeated n-grams and shared contexts, so each
    # corpus occurrence is counted exactly once.
    ngram_counts = {gram: 0 for gram in seq_ngrams}
    context_counts = {gram[:-1]: 0 for gram in seq_ngrams}

    # Loop invariants: the cleaning pattern and the sentence padding.
    cleaner = re.compile('[^A-Za-z0-9 ]+')
    left_pad = ['<s>'] * (n - 1)
    right_pad = ['</s>'] * (n - 1)

    for file_name in os.listdir(i_path):
        # A file belongs to the date when the date string occurs anywhere
        # in its name.
        if date not in file_name:
            continue
        with open(os.path.join(i_path, file_name), 'rb') as f:
            # NOTE: pickle.load can execute arbitrary code; only point
            # i_path at files you trust.
            sentences = pickle.load(f)
        for sentence in sentences:
            words = cleaner.sub('', sentence).lower().split()
            tokens = left_pad + words + right_pad
            # Slide one window over the tokens. Contexts are compared as
            # whole tokens, never as substrings, and each counted context
            # is followed by a token, which is what the ratio requires.
            for start in range(len(tokens) - n + 1):
                gram = tuple(tokens[start:start + n])
                if gram in ngram_counts:
                    ngram_counts[gram] += 1
                context = gram[:-1]
                if context in context_counts:
                    context_counts[context] += 1

    # Keys are printed as stringified token lists.
    print('\n')
    print({str(list(gram)): hits for gram, hits in ngram_counts.items()})
    print({str(list(ctx)): hits for ctx, hits in context_counts.items()})
    print('\n')

    # An unseen context would mean dividing by zero; report probability 0.
    if 0 in context_counts.values():
        return 0

    probability = 1
    for gram in seq_ngrams:
        probability *= ngram_counts[gram] / context_counts[gram[:-1]]
    return probability
# The part of each file name before the first '_' is used as its date key.
dates = sorted({file.split('_')[0] for file in os.listdir(i_path)})
seq_prob_d = {}
sequence = 'austerity measures'
for date in dates:
    print('Analysing ' + date + '...')
    # seq_prob takes (sequence, n, date, i_path); the date must be passed
    # explicitly, otherwise i_path lands in the date slot and the call fails.
    seq_prob_d[date] = seq_prob(sequence, 2, date, i_path)
print(seq_prob_d)
解决方法
暂未找到能够解决该程序问题的有效方法,小编正在努力寻找和整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)