Recall and precision evaluation for a recommender system: error in the precision part of the code

Problem description

My question is: I want to convert the recall evaluation code into precision evaluation code. (Recall part) This is the full model-evaluation code I got from Kaggle:

# Assumes the usual notebook setup: the interactions/articles dataframes and
# get_items_interacted() are defined in earlier cells.
import random
import pandas as pd

#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self,person_id,sample_size,seed=42):
        interacted_items = get_items_interacted(person_id,interactions_full_indexed_df)
        all_items = set(articles_df['contentId'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        # random.sample needs a sequence (not a set) on Python 3.11+
        non_interacted_items_sample = random.sample(list(non_interacted_items),sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self,item_id,recommended_items,topn):
        try:
            index = next(i for i,c in enumerate(recommended_items) if c == item_id)
        except StopIteration:
            index = -1
        hit = int(index in range(0,topn))
        return hit,index

    def evaluate_model_for_user(self,model,person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id,
                                               items_to_ignore=get_items_interacted(person_id,interactions_train_indexed_df),
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id,sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['contentId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5,index_at_5 = self._verify_hit_top_n(item_id,valid_recs,5)
            hits_at_5_count += hit_at_5
            hit_at_10,index_at_10 = self._verify_hit_top_n(item_id,valid_recs,10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N
        #recommended items, when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count,'hits@10_count':hits_at_10_count,'interacted_count': interacted_items_count_testset,'recall@5': recall_at_5,'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self,model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx,person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model,person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % len(people_metrics))

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count',ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),'recall@5': global_recall_at_5,'recall@10': global_recall_at_10}    
        return global_metrics,detailed_results_df
    
model_evaluator = ModelEvaluator()  
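For context, the evaluator is driven roughly like this (the model name below is only a placeholder; any object exposing recommend_items() and get_model_name(), like the recommender models built earlier in the notebook, will do):

# Placeholder model name; substitute whichever recommender is being evaluated
global_metrics, detailed_results_df = model_evaluator.evaluate_model(some_recommender_model)
print(global_metrics)          # {'modelName': ..., 'recall@5': ..., 'recall@10': ...}
detailed_results_df.head(10)   # per-user hit counts and recall values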

(Precision part) So I tried the following. The result is a global (overall) precision value greater than 1.00, which cannot be right, since precision should never exceed 1.0:

#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    # get_not_interacted_items_sample and _verify_hit_top_n are unchanged from the
    # recall version above, so they are omitted here.

    def evaluate_model_for_user(self,model,person_id):
        # ... same as the recall version up to the hit@10 check ...
            hit_at_10,index_at_10 = self._verify_hit_top_n(item_id,valid_recs,10)
            hits_at_10_count += hit_at_10

        #Precision is the rate of the interacted items that are ranked among the Top-N
        #recommended items, when mixed with a set of non-relevant items
        precision_at_5 = hits_at_5_count / 5
        precision_at_10 = hits_at_10_count / 10

        person_metrics = {'hits@5_count':hits_at_5_count,'hits@10_count':hits_at_10_count,'precision@5': precision_at_5,'precision@10': precision_at_10}
        return person_metrics

    def evaluate_model(self,model):
        # ... same per-user loop as the recall version ...
            person_metrics = self.evaluate_model_for_user(model,person_id)
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics)

        global_precision_at_5 = detailed_results_df['hits@5_count'].sum() / 5
        global_precision_at_10 = detailed_results_df['hits@10_count'].sum() / 10

        global_metrics = {'modelName': model.get_model_name(),'precision@5': global_precision_at_5,'precision@10': global_precision_at_10}
        return global_metrics,detailed_results_df
    
model_evaluator = ModelEvaluator() 
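To see why the global value blows past 1.00: the numerator sums hits over every user and every test item, while the denominator stays a constant 5 (or 10), so the ratio grows without bound. Purely illustrative numbers (not from my data):

# Illustrative only: 1000 users with 2 hits@5 each on average
total_hits_at_5 = 1000 * 2
global_precision_at_5 = total_hits_at_5 / 5   # = 400.0 -- far above 1.0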

I think the problem is here (I should be iterating over the cut-off K, i.e. 5 and 10, but I don't know how to do that):

        detailed_results_df = pd.DataFrame(people_metrics)

        global_precision_at_5 = detailed_results_df['hits@5_count'].sum() / 5
        global_precision_at_10 = detailed_results_df['hits@10_count'].sum() / 10
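One common way to keep precision@K within [0, 1] under this sampled protocol is to divide the hit counts by the number of recommendation slots actually inspected, i.e. K slots for each of the user's test items, rather than by K once. A minimal sketch of that normalization, reusing the names from the recall version (it assumes person_metrics still carries 'hits@10_count' and 'interacted_count' as it does there; whether this matches the exact definition the notebook expects is an assumption):

# Per-user precision@K: each interacted test item yields one top-K list,
# so K * interacted_items_count_testset slots are inspected per user.
precision_at_5 = hits_at_5_count / (5.0 * interacted_items_count_testset)
precision_at_10 = hits_at_10_count / (10.0 * interacted_items_count_testset)

# Global precision@K: divide by the total number of inspected slots.
global_precision_at_5 = detailed_results_df['hits@5_count'].sum() / (5.0 * detailed_results_df['interacted_count'].sum())
global_precision_at_10 = detailed_results_df['hits@10_count'].sum() / (10.0 * detailed_results_df['interacted_count'].sum())

# To iterate over the cut-offs instead of spelling each one out:
# {k: detailed_results_df['hits@%d_count' % k].sum() / (float(k) * detailed_results_df['interacted_count'].sum()) for k in (5, 10)}

Note that under this protocol each top-K list contains at most one relevant item, so the value works out to recall@K divided by K and can never exceed 1/K.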
