带有案例类评级的 ALS 训练数据

问题描述

我正在使用 Amazon consumer reviews dataset。我的目标是应用协同过滤。我成功地将自己添加为用户并添加了用户评分。

我想创建一个模型。我想使用 ALS，但我对 ALS.train() 有问题，因为我没有使用默认评级（Int、Int、Double）； case class rating (String,String,Int) 代替。我尝试将 String 值转换为 Int 并将 rating 值转换为 Double 但在将 userID 转换为 Int 时遇到问题，因为 Amazon 的 userID 类似于 "AVpgNzjwLJeJML43Kpxn" 而 prodcutID 类似于 "B00QWO9P0O,B00LH3DMUO" （包括" "）。如何克服这个问题？

CollabarativeFiltering 的代码：

 object CollabarativeFiltering {
case class Product(prooductID: String,prodcutName: String,productCat: String)

def parseProduct(fields: Row): Product = {
        //4,3,7,6
        Product(fields(4).toString(),fields(3).toString(),fields(5).toString())

}
def readProduct(location:String,spark: SparkSession): RDD[Product] = {
        val product = spark.read.option("header","true").csv(location).rdd.map(parseProduct)
                return product
}

def topRatedProducts(products : RDD[Product],ratings : RDD[User_ratings.rating],i: Int): Map[ Int,String] = {
        // Create mostRatedProducts(productID,Number_of_Product)
        val mostRatedProducts = ratings.groupBy(_.productID).map(f=> (f._1,f._2.size)).takeOrdered(100)(Ordering[Int].reverse.on(_._2))

                // Select 100 of the top rated Products
                val selectedProdcut = shuffle(mostRatedProducts).map(f => (f._2,products.filter(_.prooductID == f._1)
                        .map(p => p.prodcutName )
                        .take(1)(0) ) ).take(i).toMap
                return selectedProdcut
}

def getratings(topRatedProduct: Map[Int,String],spark: SparkSession): RDD[User_ratings.rating] = {
        var ourId = "A/E"
                var ourratings  = ArrayBuffer.empty[User_ratings.rating]
                        var i = 1
                        for(product <- topRatedProduct) {
                            breakable {
                                while(true) {
                                    try {
                                        println(i.toString + ") Your rating for: " + product._2 + "," )
                                        val rating = scala.io.StdIn.readInt()
                                        if(rating < 5.1 && rating > 0) {
                                            ourratings += User_ratings.rating("A/E",product._2,rating)
                                                    i += 1
                                                    break
                                        }
                                    } catch {
                                    case e: Exception => println("Invalid rating");
                                    }
                                }
                            }
                        }
        return spark.sparkContext.parallelize(ourratings)
}
def main(args: Array[String]) {
    var spark : SparkSession = null
            var fw : FileWriter= null
            try{

                spark = SparkSession.builder.appName("Spark sql").config("spark.master","local[*]").getorCreate()
                        val sc = spark.sparkContext
                        var csv_file = "Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv"
                        val sqlContext = new org.apache.spark.sql.sqlContext(sc)
                        Logger.getLogger("org").setLevel(Level.OFF)
                        Logger.getLogger("akka").setLevel(Level.OFF)

                        //Loading Products
                        val products = CollabarativeFiltering.readProduct(csv_file,spark)
                        products.cache()

                        products.take(10).foreach(println)

                        //Loading ratings
                        val ratings = User_ratings.readratings(csv_file,spark)
                        ratings.take(10).foreach(println)

                        //Checking  Top Rated Products

                        val topRatedProduct = topRatedProducts(products,ratings,10)
                        topRatedProduct.take(10).foreach(println)

                        // Ask user to rate 10 top rated product
                        val ourratings = getratings(topRatedProduct,spark)

                        // Add User ratings
                        val editedratings = ratings.union(ourratings)

                        //normalizing the ratings
                        val normalizedratings = User_ratings.normalizingratings(editedratings)

                        // Training the model
                        val Array(train,test) = normalizedratings.randomSplit(Array(0.8,0.2))
                        train.cache()
                        test.cache()
                        val ranks = Array(8,12)
                        val numIterations =Array(1,5,10)
                        //val alpha = 0.01
                        val lambdas = Array(10,20)

                        fw = new FileWriter("Results.txt",true)
                        println("RANK ---- LAMBDA --- IteraTION ---- MSE" )
                        fw.write("RANK ---- LAMBDA --- IteraTION ---- MSE\n" )

                        for(i <- ranks) {
                            for(j <- lambdas) {
                                for(k <- numIterations) {
                                    // Statistics about the runtime of training
                                    val start = System.nanoTime()                      

                                            val als_model = ALS.train(train,i,k,j)            

                                            // Shape our data by removing rating So that we wil predict the ratings for them
                                            val usersProducts = test.map(f => (f.userID,f.productID))

                                            // Predict
                                            val predictions = als_model.predict(usersProducts).map(f => ((f.user,f.product),f.rating))

                                            // We hold (user,movie) as a Key and (real rating,predicted rating) pair as Tuple
                                            val real_and_predictions = test.map(f => ((f.userID,f.productID),f.rating)).join(predictions)

                                            // Calculate Mean Square Error
                                            val mean_square_err = real_and_predictions.map(f => sqr(f._2._1 - f._2._2)).mean()

                                            print(i.toString + " -------- " + j.toString + " --------" + k.toString +  " -------- ")
                                            println(mean_square_err.toString + "\n")
                                            println("Time elapsed: " + (System.nanoTime()-start)/1e9 )

                                            fw.write(i.toString + " -------- " + j.toString + " --------" + k.toString +  " -------- ")
                                            fw.write(mean_square_err.toString + "\n")
                                            fw.write("Time elapsed: " + ((System.nanoTime()-start)/1e9).toString + "\n" )
                                }
                            }
                        }
                
            }

catch{
case e : Exception => throw e
}finally {
    spark.stop()
}
println("done")
}
}

User_ratings 的代码：

    object User_ratings {
case class rating(userID: String,productID: String,rating: Int)

// Create rating object from Row
def parserating(fields: Row): rating = {
        rating(fields(0).toString,fields(4).toString,fields(18).toString().toInt)
}

// Read ratings from csv file and create RDD[rating]
def readratings(location:String,spark: SparkSession): RDD[rating] = {
        val ratings = spark.read.option("header","true").csv(location).rdd.map(parserating)
                return ratings
}

// normalizing the  ratings by dividing user's rating to average of user's ratings
def normalizingratings(ratings : RDD[User_ratings.rating]) : RDD[User_ratings.rating] = {
        // Grouping according to user.
        val ratingsofUsers = ratings.groupBy(f => f.userID).map( x => (x._1,x._2.map( r => r.rating).sum / x._2.size ) )

                // Collecting as Map
                val userMap = ratingsofUsers.collect().toMap

                // normalizing the ratings
                val normalizedratings = ratings.map( f => rating(f.userID,f.productID,f.rating / userMap(f.userID) ) )

                return normalizedratings
}

def main(args: Array[String]): Unit = {

        var spark: SparkSession = null
                var fw :FileWriter = null

                try {

                    spark = SparkSession.builder.appName("Spark sql").config("spark.master","local[*]").getorCreate()
                            val sc = spark.sparkContext     
                            val sqlContext = new org.apache.spark.sql.sqlContext(sc)
                            Logger.getLogger("org").setLevel(Level.OFF)
                            Logger.getLogger("akka").setLevel(Level.OFF)

                } catch {
                case e: Exception => throw e
                } finally {
                    val end = System.nanoTime()
                            spark.stop()
                            fw.close()
                }
println("done")
}
}

/*
       Reference : https://spark.apache.org/docs/latest/ml-collaborative-filtering.html
 */

问题是当我使用：

val als_model = ALS.train(train,j)

它给出：

找到预期的 org.apache.spark.mllib.recommendation.RDD[rating] RDD[User_ratings.rating]

我想使用 ALS 来训练我的 RDD，但不能。如果不可能，还有其他方法可以训练我的数据向用户推荐类似产品吗？

解决方法

实际上，我应用的基本解决方案是将 hash() 函数用于我的 String 类型 UserID 和 ProdcutId。所以格式与 Machine Learning Rating Class 匹配。

似乎ALS只支持数字作为训练数据，所以你应该构建一个映射来将字符串字段转换为int。

看看this

als machine-learning rdd scala scala