LightFm有两种预测方法:predict()和predict_rank()。评价函数precision_at_k是以predict_rank函数为基础的。由于我为每个用户排序了许多项,所以predict方法更适合/更快。因此,我尝试使用预测方法复制precision_at_k方法产生的精度@k分数。
显然,无论使用predict_rank还是predict,precision@k评分都不应改变,但我无法用predict方法复制从precision_at_k(基于predict_rank)获得的分数。
事实上,predict方法的评价分数总是比包中包含的precision_at_k方法得出的评价分数差。为什么会这样呢?
下面是一个使用开源数据的示例。为了简单起见,我只使用了一小部分数据,这是一个没有特性的基本模型,已知的正项不会被删除(train_data参数没有在precision_at_k中指定)。
为什么这很重要:我想计算ndcg来进行评估,如果我能够复制prec@k评分和预测,我知道预测的后处理是正确的,我只需要改变度量。
from lightfm import LightFM
from scipy.sparse import coo_matrix as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import zipfile
import csv
import requests
import json
from itertools import islice
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.cross_validation import random_train_test_split
######################################
#
# Fetching the training data
#
######################################
def _download(url: str, dest_path: str):
    """Stream the resource at *url* into the file *dest_path*.

    Downloads in 1 MiB chunks so the whole archive never has to fit in
    memory; raises ``requests.HTTPError`` on a non-2xx response.
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()
    one_mib = 2 ** 20
    with open(dest_path, "wb") as out_file:
        for block in response.iter_content(chunk_size=one_mib):
            out_file.write(block)
def get_data():
    """Fetch (once) the Book-Crossing CSV dump and return readers over it.

    Returns a 3-tuple of ``csv.DictReader`` objects for the ratings,
    books and users tables of BX-CSV-Dump.zip, in that order.

    NOTE(review): the DictReaders wrap lazy generators, but the ZipFile is
    closed when the ``with`` block exits at ``return`` — confirm the member
    streams stay readable after that on the Python version in use.
    """
    ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")
    # Cache the archive under ./data so repeated runs skip the download.
    if not os.path.exists("data"):
        os.makedirs("data")
        _download(ratings_url, "data/data.zip")
    with zipfile.ZipFile("data/data.zip") as archive:
        return (
            csv.DictReader(
                # The dump is not clean UTF-8; drop undecodable bytes.
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
                delimiter=";",
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Users.csv")), delimiter=";"
            ),
        )
def get_ratings():
    """Return the reader over the BX-Book-Ratings table."""
    ratings, _books, _users = get_data()
    return ratings
def get_book_features():
    """Return the reader over the BX-Books table."""
    _ratings, books, _users = get_data()
    return books
def get_user_features():
    """Return the reader over the BX-Users table."""
    _ratings, _books, users = get_data()
    return users
# small dataset: build a DataFrame of (user, item) interaction pairs
udf = pd.DataFrame([x['User-ID'] for x in get_ratings()])
iid = pd.DataFrame([x['ISBN'] for x in get_ratings()])
frames = [udf, iid]
# subsample the user list: keep the first 800 distinct users
# NOTE(review): iteration order of a set of strings is not stable across
# interpreter runs (hash randomization), so the sample may vary.
user_set = set([x['User-ID'] for x in get_ratings()])
user_samples = list(user_set)[:800]
train_df = pd.concat(frames, axis=1)
train_df.columns = ['user_id','item_id']
print(train_df.shape)
# restrict the interactions to the sampled users
train_df = train_df[train_df.user_id.isin(user_samples)]
print(train_df.shape)
# (id, [single feature]) pairs: author per book, age per user,
# limited to entities that actually appear in train_df
book_features = [(x['ISBN'], [x['Book-Author']]) for x in get_book_features() if x['ISBN'] in train_df.item_id.unique().tolist()]
user_features = [(x['User-ID'], [x['Age']]) for x in get_user_features() if x['User-ID'] in train_df.user_id.unique().tolist()]
# First Dataset fit — superseded by the rebuild in the next section.
dataset = Dataset()
dataset.fit(train_df.user_id.tolist(),
            train_df.item_id.tolist())
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))
# j[0] unwraps the single feature value from its one-element list
dataset.fit_partial(users=train_df.user_id.tolist(),
                    items=train_df.item_id.tolist(),
                    item_features=[j[0] for i,j in book_features],
                    user_features=[j[0] for i,j in user_features])
#######################
#
# Building the Model
#
######################
# Re-create the Dataset, this time fitted on UNIQUE user and item ids;
# this replaces the Dataset built in the previous section.
dataset = Dataset()
dataset.fit(train_df.user_id.unique().tolist(),
            train_df.item_id.unique().tolist())
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))
# Register feature names; j[0] unwraps the single value per entity.
dataset.fit_partial(users=train_df.user_id.unique().tolist(),
                    items=train_df.item_id.unique().tolist(),
                    item_features=[j[0] for i,j in book_features],
                    user_features=[j[0] for i,j in user_features])
# Sparse interaction matrix from the (user, item) pairs.
(interactions, weights) = dataset.build_interactions(((i,j) for i,j in zip(train_df.user_id, train_df.item_id)))
print(repr(interactions))
# 80/20 random split of the interactions.
(train, test) = random_train_test_split(interactions=interactions, test_percentage=0.2)
item_features = dataset.build_item_features((book_features))
print(repr(item_features))
user_features1 = dataset.build_user_features((user_features))
print(repr(user_features1))
# mapping() -> (user id map, user feature map, item id map, item feature map)
mapp = dataset.mapping()
dict_user_id = mapp[0]
dict_item_id = mapp[2]
user_list = list(dict_user_id.keys())
items_list = list(dict_item_id.keys())
# External item ids in internal-index order; used to translate indices back.
items =np.array(items_list)
# Bundle everything the evaluation code below needs.
data = {
    'train_cols': items,
    "train": train,
    'test_cols': items,
    "test": test,
    "item_features": item_features,
    "user_features": user_features1
}
#############################
#
# Training the Model
#
#############################
# Basic model, no side features, WARP loss.
model = LightFM(loss='warp')
model.fit(data['train'],
          # features intentionally disabled to keep the example minimal
          #item_features=data['item_features'],
          #user_features=data['user_features']
          )
### model performance evaluation
# Built-in precision@k (default k=10). train_interactions is deliberately
# NOT passed, so known positives are not excluded — this must be mirrored
# by the manual replication further below.
pak = precision_at_k(model,
                     test_interactions = data['test'],
                     #train_interactions = data['train'],
                     #item_features=data['item_features'],
                     #user_features=data['user_features']
                     ).mean()
print("precision@10 : {}".format(pak))
这给出 precision@10: 0.004322766792029142。在底层,precision_at_k 是用 predict_rank 方法计算 precision@k 的,如下所示:
# Manual reproduction of precision_at_k from predict_rank output.
ranks = model.predict_rank(test_interactions=data['test'],
                           #train_interactions=data['train'],
                           #item_features=data['item_features'],
                           #user_features=data['user_features'],
                           num_threads=32,
                           check_intersections=True)
# Flag test items ranked inside the top 10; the third argument makes
# np.less write the 0/1 result back into ranks.data in place.
ranks.data = np.less(ranks.data, 10, ranks.data)
# Per-user precision@10: number of top-10 hits divided by k.
precision = np.squeeze(np.array(ranks.sum(axis=1))) / 10
# Evaluate only users with at least one test interaction.
precision = precision[data['test'].getnnz(axis=1) > 0]
print('prec@10: {}'.format(precision.mean()))
这同样给出 precision@10: 0.004322766792029142,证明与内置方法一致。
如果现在改用predict方法复制precision@k,则会得到不同的结果。
############################################
#
# Replicate precision using the predict method
#
############################################
mapp = dataset.mapping()
dict_user_id = mapp[0]   # external user id -> internal index
dict_item_id = mapp[2]   # external item id -> internal index
d_user_pred = {}
for user in dict_user_id.keys():
    d_user_pred[user] = []
for uid, i in dict_user_id.items():
    # Items the user already interacted with in the training split.
    known_positives_ids = data['train_cols'][data['train'].tocsr()[i].indices]
    #print('known positives:{}'.format(known_positives_ids))
    # Score every catalogue item for this user.
    scores = model.predict(user_ids = i,
                           item_ids = np.arange(len(dict_item_id)),
                           #user_features=user_features,
                           #item_features=item_features
                           )
    # get top recommendations, best score first
    top_items_ids = data['train_cols'][np.argsort(-scores)]
    # exclude known positives from recommendations
    # BUG (confirmed in the answer at the bottom of this page): set()
    # does not preserve order, so the score ranking produced by argsort
    # is destroyed here — this is what makes the scores disagree.
    top_items_ids = np.array(list(set(top_items_ids) - set(known_positives_ids)))
    print('top_items_ids:{}'.format(top_items_ids[:5]))
    d_user_pred[uid] = top_items_ids
##################################
#
# Precision@k evaluation
#
##################################
# get predictions df: one row per user, first 20 recommendations kept
df = pd.DataFrame.from_dict(d_user_pred, orient='index').iloc[:,:20]
df['user_id'] = df.index
# long format: (user_id, rank position, item_id)
df = df.melt(id_vars='user_id')
df.columns = ['user_id','rank','item_id']
# collapse back to one list of predictions per user
pred_df = df.groupby('user_id').aggregate(lambda tdf: tdf.tolist()).reset_index()
pred_df.columns = ['user_id','rank','predictions']
# get ground truth df from the dense test matrix
t = pd.DataFrame(data['test'].todense(), columns=items_list)
t['user_id'] = user_list
t = t.melt(id_vars='user_id')
# keep only actual (user, item) test interactions
t = t[t.value==1].drop('value',axis=1)
t.columns = ['user_id','item_id']
actual_df = t.groupby('user_id').aggregate(lambda tdf: tdf.tolist()).reset_index()
actual_df.columns = ['user_id','actual']
# generate eval_df; drop users without any test interaction
eval_df = pred_df.merge(actual_df,on='user_id',how='left')
eval_df = eval_df[eval_df.actual.notnull()]
def precision(actual, predictions, k):
    """Fraction of retrieved documents @k that are relevant.

    Duplicate hits in the top-k are counted once.
    """
    relevant = set(actual)
    hits = {item for item in predictions[:k] if item in relevant}
    return len(hits) / k
# Per-user precision@10 from the predict-based recommendation lists.
eval_df['prec'] = eval_df.apply(lambda row : precision(actual=row['actual'],
                                                       predictions=row['predictions'],
                                                       k=10), axis = 1)
eval_df.prec.mean()
结果为 0.0005763688760806917。
总之,predict_rank给出了精度@k分数= 0.004322766792029142,预测方法给出了精度@k score=0.0005763688760806917。为什么会这样呢?
发布于 2020-11-26 14:50:48
使用
top_items_ids = [item_id for item_id in top_items_ids if item_id not in known_positives_ids]
而不是
top_items_ids = np.array(list(set(top_items_ids) - set(known_positives_ids)))
即可解决这一不一致:列表推导保留了argsort排出的得分顺序,而集合差运算不保留元素顺序,会打乱排名。
https://datascience.stackexchange.com/questions/85451
复制相似问题