推荐经典算法实现之DMF(TensorFlow+MovieLens)

it2022-05-05  100

# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@date: 2019.07.17
@function: Implementing DMF with Tensorflow
           Dataset: Movielen-1m
           Evaluating: hit ratio, ndcg
           https://www.ijcai.org/proceedings/2017/0447.pdf
           https://github.com/RuidongZ/Deep_Matrix_Factorization_Models
'''
import tensorflow as tf
import numpy as np
import argparse
import ast
import os
import heapq
import math
import sys

# Default data locations (unchanged from the original script).
TRAIN_PATH = '/data/fjsdata/ctKngBase/ml/ml-1m.train.rating'
TEST_NEG_PATH = '/data/fjsdata/ctKngBase/ml/ml-1m.test.negative'


class DMF:
    """Deep Matrix Factorization trained with TensorFlow 1.x graph-mode APIs.

    A user is represented by their row of the rating matrix and an item by
    its column. Each goes through its own stack of dense layers, and the
    prediction is the cosine similarity of the two resulting vectors.
    """

    def __init__(self, K, negNum=4, lr=0.001, maxEpochs=20, topK=10):
        """Load the data, build the graph and open a session.

        Args:
            K: size of the final (latent) layer of both towers.
            negNum: number of sampled negatives per observed rating.
            lr: learning rate passed to the Adam optimizer.
            maxEpochs: maximum number of training epochs.
            topK: cut-off used for HR@K / NDCG@K during evaluation.
        """
        # prepare data
        self.dataSet = DataSet()
        self.shape = self.dataSet.shape
        self.maxRate = self.dataSet.maxRate
        self.train = self.dataSet.train
        self.testNeg = self.dataSet.getTestNeg()
        # initiate model
        self.negNum = negNum
        self.add_embedding_matrix()
        self.add_placeholders()
        self.userLayer = [512, K]
        self.itemLayer = [512, K]
        self.add_model()
        self.add_loss()
        self.lr = lr
        self.add_train_step()
        self.init_sess()
        self.maxEpochs = maxEpochs
        self.batchSize = 256
        self.topK = topK
        self.earlyStop = 5

    def add_placeholders(self):
        """Create the input placeholders (user ids, item ids, ratings, drop)."""
        self.user = tf.placeholder(tf.int32)
        self.item = tf.placeholder(tf.int32)
        self.rate = tf.placeholder(tf.float32)
        # NOTE: `drop` is not consumed by any op built in this class.
        self.drop = tf.placeholder(tf.float32)

    def add_embedding_matrix(self):
        """Expose the rating matrix and its transpose as graph tensors."""
        self.user_item_embedding = tf.convert_to_tensor(self.dataSet.getEmbedding())
        self.item_user_embedding = tf.transpose(self.user_item_embedding)

    def _build_tower(self, inputs, in_dim, layers, prefix):
        """Build one tower: a bias-free linear projection, then ReLU layers.

        Variable names follow the original scheme: ``<prefix>_W1`` for the
        projection, ``<prefix>_W<n>`` / ``<prefix>_b<n>`` for later layers.
        """
        def init_variable(shape, name):
            return tf.Variable(
                tf.truncated_normal(shape=shape, dtype=tf.float32, stddev=0.01),
                name=name)

        W1 = init_variable([in_dim, layers[0]], prefix + "_W1")
        out = tf.matmul(inputs, W1)
        for i in range(0, len(layers) - 1):
            W = init_variable([layers[i], layers[i + 1]], prefix + "_W" + str(i + 2))
            b = init_variable([layers[i + 1]], prefix + "_b" + str(i + 2))
            out = tf.nn.relu(tf.add(tf.matmul(out, W), b))
        return out

    def add_model(self):
        """Build both towers and the clipped cosine-similarity prediction."""
        user_input = tf.nn.embedding_lookup(self.user_item_embedding, self.user)
        item_input = tf.nn.embedding_lookup(self.item_user_embedding, self.item)

        with tf.name_scope("User_Layer"):
            user_out = self._build_tower(
                user_input, self.shape[1], self.userLayer, "user")

        with tf.name_scope("Item_Layer"):
            item_out = self._build_tower(
                item_input, self.shape[0], self.itemLayer, "item")

        # The epsilon sits inside the sqrt so that an all-zero ReLU output
        # gives neither a 0/0 prediction nor an infinite sqrt gradient.
        eps = 1e-12
        norm_user_output = tf.sqrt(tf.reduce_sum(tf.square(user_out), axis=1) + eps)
        norm_item_output = tf.sqrt(tf.reduce_sum(tf.square(item_out), axis=1) + eps)
        self.y_ = tf.reduce_sum(
            tf.multiply(user_out, item_out), axis=1, keepdims=False
        ) / (norm_item_output * norm_user_output)
        # Clamp on both sides: the loss takes log(y_) and log(1 - y_), so
        # neither 0 nor 1 may reach it.
        self.y_ = tf.clip_by_value(self.y_, 1e-6, 1.0 - 1e-6)

    def add_loss(self):
        """Cross-entropy between predictions and ratings scaled by maxRate."""
        regRate = self.rate / self.maxRate
        losses = regRate * tf.log(self.y_) + (1 - regRate) * tf.log(1 - self.y_)
        # No L2 regularisation term is added.
        self.loss = -tf.reduce_sum(losses)

    def add_train_step(self):
        """Create the Adam training op with a constant learning rate."""
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_step = optimizer.minimize(self.loss)

    def init_sess(self):
        """Open a session (GPU memory growth on) and initialise variables."""
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.allow_soft_placement = True
        self.sess = tf.Session(config=self.config)
        self.sess.run(tf.global_variables_initializer())

    def run(self):
        """Train for up to maxEpochs epochs, evaluating after each one."""
        best_hr = -1
        best_NDCG = -1
        best_epoch = -1
        print("Start Training!")
        for epoch in range(self.maxEpochs):
            print("="*20+"Epoch ", epoch, "="*20)
            self.run_epoch(self.sess)
            print('='*50)
            print("Start Evaluation!")
            hr, NDCG = self.evaluate(self.sess, self.topK)
            print("Epoch ", epoch, "HR: {}, NDCG: {}".format(hr, NDCG))
            # An epoch counts as "best" when either metric improves.
            if hr > best_hr or NDCG > best_NDCG:
                best_hr = hr
                best_NDCG = NDCG
                best_epoch = epoch
            if epoch - best_epoch > self.earlyStop:
                print("Normal Early stop!")
                break
            print("="*20+"Epoch ", epoch, "End"+"="*20)
        print("Best hr: {}, NDCG: {}, At Epoch {}".format(best_hr, best_NDCG, best_epoch))
        print("Training complete!")

    def run_epoch(self, sess, verbose=10):
        """Run one pass over freshly sampled training instances.

        Args:
            sess: the session to run the training op in.
            verbose: print a running loss every `verbose` batches (0 = off).

        Returns:
            Mean batch loss of the epoch (0.0 if there were no instances).
        """
        train_u, train_i, train_r = self.dataSet.getInstances(self.train, self.negNum)
        train_len = len(train_u)
        shuffled_idx = np.random.permutation(np.arange(train_len))
        train_u = train_u[shuffled_idx]
        train_i = train_i[shuffled_idx]
        train_r = train_r[shuffled_idx]

        # Ceiling division: `len // batch + 1` would add an empty trailing
        # batch whenever train_len is an exact multiple of the batch size.
        num_batches = (train_len + self.batchSize - 1) // self.batchSize

        losses = []
        for i in range(num_batches):
            min_idx = i * self.batchSize
            max_idx = min(train_len, (i + 1) * self.batchSize)
            train_u_batch = train_u[min_idx: max_idx]
            train_i_batch = train_i[min_idx: max_idx]
            train_r_batch = train_r[min_idx: max_idx]

            feed_dict = self.create_feed_dict(train_u_batch, train_i_batch, train_r_batch)
            _, tmp_loss = sess.run([self.train_step, self.loss], feed_dict=feed_dict)
            losses.append(tmp_loss)
            if verbose and i % verbose == 0:
                sys.stdout.write('\r{} / {} : loss = {}'.format(
                    i, num_batches, np.mean(losses[-verbose:])))
                sys.stdout.flush()
        loss = np.mean(losses) if losses else 0.0
        print("\nMean loss in this epoch is: {}".format(loss))
        return loss

    def create_feed_dict(self, u, i, r=None, drop=None):
        """Build a feed dict, omitting the optional inputs left as None.

        Feeding None would be converted to NaN for the float placeholders;
        skipping those keys keeps the feed limited to real values.
        """
        feed = {self.user: u, self.item: i}
        if r is not None:
            feed[self.rate] = r
        if drop is not None:
            feed[self.drop] = drop
        return feed

    def evaluate(self, sess, topK):
        """Compute mean HR@topK and NDCG@topK over the test candidates.

        For every test row the first candidate is the ground-truth item and
        the rest are negatives. Score ties are resolved against the ground
        truth, so a model that gives every candidate the same score gets no
        credit.

        Returns:
            (mean hit ratio, mean NDCG)
        """
        def getHitRatio(ranklist, targetItem):
            return 1 if targetItem in ranklist else 0

        def getNDCG(ranklist, targetItem):
            for pos, item in enumerate(ranklist):
                if item == targetItem:
                    return math.log(2) / math.log(pos + 2)
            return 0

        hr = []
        NDCG = []
        testUser = self.testNeg[0]
        testItem = self.testNeg[1]
        for i in range(len(testUser)):
            candidates = testItem[i]
            target = candidates[0]
            feed_dict = self.create_feed_dict(testUser[i], candidates)
            predict = sess.run(self.y_, feed_dict=feed_dict)

            # Rank candidate positions by (score, position). The ground truth
            # sits at position 0, so it loses every tie instead of winning it
            # through the stable ordering of heapq.nlargest.
            order = heapq.nlargest(
                topK, range(len(candidates)),
                key=lambda j: (float(predict[j]), j))
            ranklist = [candidates[j] for j in order]

            hr.append(getHitRatio(ranklist, target))
            NDCG.append(getNDCG(ranklist, target))
        return np.mean(hr), np.mean(NDCG)


class DataSet(object):
    """Loads the training ratings and the test candidate file."""

    def __init__(self, trainPath=TRAIN_PATH, testNegPath=TEST_NEG_PATH):
        """Read the training file.

        Args:
            trainPath: tab-separated file of `user, item, rating` rows.
            testNegPath: tab-separated test file read by getTestNeg().
        """
        self.trainPath = trainPath
        self.testNegPath = testNegPath
        self.train, self.shape = self.getTrainData()
        self.trainDict = self.getTrainDict()

    def getTrainData(self):
        """Parse the training file.

        Sets self.maxRate to the largest rating seen.

        Returns:
            (list of (user, item, rating) tuples, [max_user + 1, max_item + 1])
        """
        data = []
        u = 0
        i = 0
        maxr = 0.0
        with open(self.trainPath, 'r') as f:
            for line in f:
                # strip() rather than line[:-1]: the last line may lack a
                # newline, and blank lines must not reach int().
                line = line.strip()
                if not line:
                    continue
                fields = line.split("\t")
                user = int(fields[0])
                movie = int(fields[1])
                score = float(fields[2])
                data.append((user, movie, score))
                if user > u:
                    u = user
                if movie > i:
                    i = movie
                if score > maxr:
                    maxr = score
        self.maxRate = maxr
        print("Loading Success!\n"
              "Data Info:\n"
              "\tUser Num: {}\n"
              "\tItem Num: {}\n"
              "\tData Size: {}".format(u, i, len(data)))
        return data, [u+1, i+1]

    def getTrainDict(self):
        """Map (user, item) -> rating for fast membership checks."""
        return {(row[0], row[1]): row[2] for row in self.train}

    def getEmbedding(self):
        """Return the dense float32 user x item rating matrix (0 = unrated)."""
        train_matrix = np.zeros([self.shape[0], self.shape[1]], dtype=np.float32)
        for user, movie, rating in self.train:
            train_matrix[user][movie] = rating
        return train_matrix

    def getInstances(self, data, negNum):
        """Expand ratings into training instances with sampled negatives.

        Each observed (user, item, rating) is followed by `negNum` random
        items the user has not rated, labelled 0.0.

        Returns:
            (users, items, ratings) as three aligned numpy arrays.
        """
        user = []
        item = []
        rate = []
        for row in data:
            user.append(row[0])
            item.append(row[1])
            rate.append(row[2])
            for _ in range(negNum):
                # Rejection sampling: redraw until the pair is unobserved.
                j = np.random.randint(self.shape[1])
                while (row[0], j) in self.trainDict:
                    j = np.random.randint(self.shape[1])
                user.append(row[0])
                item.append(j)
                rate.append(0.0)
        return np.array(user), np.array(item), np.array(rate)

    def getTestNeg(self):
        """Load the test candidates, one evaluation row per file line.

        Each line is `(user, positive_item)` followed by tab-separated
        negative item ids. The positive item is placed first in its row,
        which evaluate() relies on.

        Returns:
            [users, items]: two arrays with one row per line, where
            users[k] repeats the user id once per candidate in items[k].
        """
        user = []
        item = []
        with open(self.testNegPath, 'r') as fd:
            for line in fd:
                line = line.strip()
                if not line:
                    continue
                arr = line.split('\t')
                # literal_eval parses the "(user, item)" pair without
                # executing arbitrary code from the data file.
                pair = ast.literal_eval(arr[0])
                u = pair[0]
                candidates = [pair[1]]  # one positive item
                candidates.extend(int(x) for x in arr[1:])  # negative items
                user.append([u] * len(candidates))
                item.append(candidates)
        return [np.array(user), np.array(item)]


if __name__ == '__main__':
    for K in [64, 32, 16, 8]:
        # Start each run from an empty graph so earlier models (and their
        # copies of the rating matrix) do not accumulate in it.
        tf.reset_default_graph()
        dmf = DMF(K, negNum=4, lr=0.001, maxEpochs=2, topK=10)
        try:
            dmf.run()
        finally:
            dmf.sess.close()

复现论文的源码,发现两个问题:

1、当隐因子等于8和16时,经过 self.y_ = tf.maximum(1e-6, self.y_) 之后,self.y_一直是1e-6,调试很久没找到问题,最后得到的结果是hr和ndcg都是1.0。作者在论文中给出了隐因子等于8和16时的试验结果,待咨询论文作者。

请关注:https://github.com/RuidongZ/Deep_Matrix_Factorization_Models/issues/6

2、当数据集的矩阵过大时,超过2G,需要做调整,参考:

https://blog.csdn.net/fjssharpsword/article/details/96431553

也向论文作者提出咨询:

https://github.com/RuidongZ/Deep_Matrix_Factorization_Models/issues/7


最新回复(0)