This post records an implementation of the item2vec algorithm with Spark. If you work on related problems, feel free to reach out: zhaoliang19960421@outlook.com
import org.apache.spark.ml.feature.{Word2Vec, Word2VecModel}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object Item2Vec {
  def main(args: Array[String]): Unit = {
    val Array(locale: String, startDate: String, endDate: String) = args
    val sparkSession: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .getOrCreate()
    // Pipeline: build per-user app sequences, fit Word2Vec on them,
    // then derive item vectors and average-pooled user vectors.
    val userItemSeqDf = getUserItemSeq(sparkSession, startDate, endDate)
    val model = getWord2VecModel(userItemSeqDf, "usage_seq", "vector")
    val itemVec = getItemVec(model)
    val userVec = getUserVec(sparkSession, userItemSeqDf, itemVec)
  }
  // Print the topN items most similar to `item`; findSynonyms throws when the
  // item is not in the model's vocabulary, hence the try/catch.
  def getItemSim(model: Word2VecModel, item: String, topN: Int): Unit = {
    try {
      println(s"Top $topN most similar items to $item:")
      model.findSynonyms(item, topN).show(truncate = false)
    } catch {
      case _: Exception => println(s"$item is not in the vocabulary")
    }
  }
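  // Minimal usage sketch (the package name is hypothetical):
  //   getItemSim(model, "com.example.music", 10)
  // prints a (word, similarity) DataFrame ranked by cosine similarity.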
  // Build a user vector by average-pooling the vectors of the items in the
  // user's usage sequence; items missing from the vocabulary fall back to a
  // zero vector with the same dimension as the embeddings (200).
  def getUserVec(sparkSession: SparkSession, orgDf: DataFrame, itemVec: DataFrame): DataFrame = {
    val arrayDefaultVec = new Array[Double](200)
    // Broadcast the item -> vector map and read it inside the UDF via .value,
    // so each executor holds one copy instead of shipping the map per task.
    val itemVecBC = sparkSession.sparkContext.broadcast(
      itemVec.rdd.map(r => (r.getString(0), r.getSeq[Double](1).toArray)).collectAsMap())
    val itemVecAvgPoolingUDF: UserDefinedFunction = udf((seq: mutable.WrappedArray[String]) => {
      val vecs = seq.map(itemVecBC.value.getOrElse(_, arrayDefaultVec))
      if (vecs.isEmpty) arrayDefaultVec
      else {
        // Element-wise sum, then divide by the sequence length (average pooling).
        val sum = vecs.reduce((x, y) => x.zip(y).map { case (a, b) => a + b })
        sum.map(_ / vecs.length)
      }
    })
    val userVecDf = orgDf
      .withColumn("vector", itemVecAvgPoolingUDF(col("usage_seq")))
      .select("gaid", "vector")
    userVecDf
  }
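  // Worked example of the pooling above: a two-item sequence whose vectors are
  // [1.0, 2.0] and [3.0, 4.0] yields the element-wise mean [2.0, 3.0].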
  // Export the model's vocabulary as (item, L2-normalized Array[Double]) rows;
  // getVectors returns one row per item with columns (word, vector).
  def getItemVec(model: Word2VecModel): DataFrame = {
    val vector2ArrayUDF: UserDefinedFunction = udf((vec: Vector) => {
      val norm = Vectors.norm(vec, 2)
      vec.toArray.map(e => if (norm != 0) e / norm else 0.0)
    })
    val itemVec = model.getVectors
      .withColumn("vectorArray", vector2ArrayUDF(col("vector")))
      .selectExpr("word as item", "vectorArray")
    itemVec
  }
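  // Because every vector above is L2-normalized, cosine similarity between two
  // items reduces to a plain dot product. A minimal sketch (this helper is not
  // part of the original code):
  //   def dot(a: Array[Double], b: Array[Double]): Double =
  //     a.zip(b).map { case (x, y) => x * y }.sum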
  // Turn each user's raw (pkg, timestamp, duration) events into an ordered app
  // usage sequence, collapsing quick re-opens of the same app into one usage.
  def getUserItemSeq(sparkSession: SparkSession, startDate: String, endDate: String): DataFrame = {
    val getSeqUDF: UserDefinedFunction = udf((seq: mutable.WrappedArray[GenericRowWithSchema]) => {
      val listSeq = ArrayBuffer[String]()
      // sortBy returns a new collection, so keep the sorted result.
      val sorted = seq.sortBy(e => e.getAs[Long]("timestamp"))
      var pkg = sorted.head.getAs[String]("pkg")
      var open = sorted.head.getAs[Long]("timestamp")
      var dura = sorted.head.getAs[Double]("duration")
      listSeq.append(pkg)
      // Skip the head, which was already appended.
      sorted.drop(1).foreach(e => {
        val tmpPkg = e.getAs[String]("pkg")
        val tmpOpen = e.getAs[Long]("timestamp")
        val tmpDura = e.getAs[Double]("duration")
        // Append unless this is the same app re-opened within 10 s of the
        // previous usage ending (timestamps in ms, durations in s).
        if (!tmpPkg.equals(pkg) || (tmpOpen - open) / 1000 - dura > 10)
          listSeq.append(tmpPkg)
        pkg = tmpPkg
        open = tmpOpen
        dura = tmpDura
      })
      listSeq
    })
    val dfAppUsage = sparkSession.read.parquet("hdfs://***")
      .where(s"date between $startDate and $endDate")
      .groupBy("gaid")
      .agg(collect_list(struct("pkg", "timestamp", "duration")).as("seq"))
      .withColumn("usage_seq", getSeqUDF(col("seq")))
      .withColumn("seq_len", size(col("usage_seq")))
      .where("seq_len > 10")
      .selectExpr("gaid", "usage_seq")
    dfAppUsage
  }
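  // Example of the collapsing rule above: given events (timestamps in ms)
  //   (A, t=0, dur=5.0), (A, t=8000, dur=3.0), (B, t=60000, dur=2.0)
  // the second A starts 8 - 5 = 3 s after the first usage ended (<= 10 s), so
  // the resulting sequence is [A, B] rather than [A, A, B].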
  // Fit Spark ML Word2Vec on the usage sequences: each user's sequence plays
  // the role of a sentence and each app the role of a word.
  def getWord2VecModel(orgDf: DataFrame, inputCol: String, outputCol: String): Word2VecModel = {
    val model: Word2VecModel = new Word2Vec()
      .setInputCol(inputCol)
      .setOutputCol(outputCol)
      .setSeed(1024)
      .setMaxIter(10)
      .setMinCount(5) // ignore apps that appear fewer than 5 times
      .setVectorSize(200) // must match the pooling dimension in getUserVec
      .setWindowSize(5)
      .setNumPartitions(1000)
      .setMaxSentenceLength(100) // longer sequences are split into 100-item chunks
      .fit(orgDf)
    model
  }
}
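
For completeness, a minimal sketch of how the output might be consumed (this helper is not part of the original pipeline): rank all items for one user by the dot product of the user vector with each item vector. Because the item vectors are unit-norm, this ranking matches cosine similarity for that user; collecting the item table to the driver is only reasonable for a modest vocabulary, and the gaid value is a placeholder.

// Reuses the imports above; userVec/itemVec come from getUserVec/getItemVec.
def topItemsForUser(userVec: DataFrame, itemVec: DataFrame, gaid: String, topN: Int): Array[(String, Double)] = {
  val items = itemVec.rdd.map(r => (r.getString(0), r.getSeq[Double](1).toArray)).collect()
  val u = userVec.where(s"gaid = '$gaid'").head.getSeq[Double](1).toArray
  items
    .map { case (item, v) => (item, u.zip(v).map { case (a, b) => a * b }.sum) }
    .sortBy(-_._2)
    .take(topN)
}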