SparkSQL粗分析
大致原理
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
// Demo: with plain Spark SQL, count how often each multi-item "basket"
// (the set of distinct goods bought by one user) occurs.

// Local-mode Spark entry points. SparkContext.getOrCreate is used instead of
// `new SparkContext(...)` so the snippet can be re-run in a JVM / REPL that
// already has a running context (only one context may be active per JVM).
val c0: SparkConf = new SparkConf().setAppName("a0").setMaster("local")
val sc: SparkContext = SparkContext.getOrCreate(c0)
// NOTE(review): a context already exists at this point, so the builder is
// expected to reuse it; the "a1" app name presumably does not rename the
// running application. Confirm, and consider building from a single conf.
val c1: SparkConf = new SparkConf().setAppName("a1").setMaster("local")
val spark: SparkSession = SparkSession.builder().config(c1).getOrCreate()
import spark.implicits._

// Sample purchase records: (user_id, order_id, good_id).
// createOrReplaceTempView (rather than createTempView) keeps the script
// re-runnable: createTempView throws if the view name is already registered.
sc.makeRDD(Seq(
  ("u1", "o1", "g1"),
  ("u1", "o1", "g2"),
  ("u2", "o2", "g1"),
  ("u1", "o3", "g1"),
  ("u1", "o3", "g2"),
  ("u1", "o3", "g3"),
  ("u3", "o4", "g3"),
  ("u4", "o5", "g1"),
  ("u5", "o6", "g4")
)).toDF("user_id", "order_id", "good_id").createOrReplaceTempView("t0")

// t1: one row per user holding the set of distinct goods that user bought,
// keeping only users with more than one distinct good.
// COLLECT_SET returns its elements in a non-deterministic order, and the next
// query groups on this array, so the array is sorted here to give equal sets
// an identical representation.
spark.sql(
  """
    |SELECT SORT_ARRAY(COLLECT_SET(good_id)) AS good_set FROM t0
    |GROUP BY user_id
    |HAVING SIZE(good_set) > 1
    |""".stripMargin).createOrReplaceTempView("t1")

// Count how many users share each basket and print the most common first.
spark.sql(
  """
    |SELECT good_set,count(good_set)c FROM t1
    |GROUP BY good_set
    |ORDER BY c DESC
    |""".stripMargin).show()
SQL计算结果略为粗糙
共现频数模型
大致原理
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
// Demo: mine frequent itemsets and association rules from order details with
// Spark ML's FP-Growth, then ask the model for predictions on a few baskets.

// Local-mode Spark entry points. SparkContext.getOrCreate is used instead of
// `new SparkContext(...)` so the snippet can be re-run in a JVM / REPL that
// already has a running context (only one context may be active per JVM).
val c0: SparkConf = new SparkConf().setAppName("a0").setMaster("local")
val sc: SparkContext = SparkContext.getOrCreate(c0)
// NOTE(review): a context already exists at this point, so the builder is
// expected to reuse it; the "a1" app name presumably does not rename the
// running application. Confirm, and consider building from a single conf.
val c1: SparkConf = new SparkConf().setAppName("a1").setMaster("local")
val spark: SparkSession = SparkSession.builder().config(c1).getOrCreate()
import spark.implicits._

// Sample order details: (order_id, good_id).
// createOrReplaceTempView (rather than createTempView) keeps the script
// re-runnable: createTempView throws if the view name is already registered.
sc.makeRDD(Seq(
  ("o1", "g1"),
  ("o1", "g2"),
  ("o2", "g1"),
  ("o3", "g1"),
  ("o3", "g2"),
  ("o3", "g3"),
  ("o4", "g3"),
  ("o5", "g1"),
  ("o6", "g4")
)).toDF("order_id", "good_id").createOrReplaceTempView("dwd_order_detail")

// One row per order: the distinct goods in that order, exposed as the
// "items" column that is passed to FPGrowth.fit below.
val df = spark.sql(
  """
    |SELECT COLLECT_SET(good_id) FROM dwd_order_detail
    |GROUP BY order_id
    |""".stripMargin).toDF("items")

import org.apache.spark.ml.fpm.FPGrowth

// Both thresholds are 0, presumably so that nothing is filtered out on this
// tiny demo dataset. Real data would need non-trivial thresholds.
val fpGrowth = new FPGrowth().setMinSupport(0).setMinConfidence(0)
val model = fpGrowth.fit(df)

// Print the mined itemsets and the derived association rules.
model.freqItemsets.show()
model.associationRules.show()

// Run the fitted model over a handful of space-separated probe baskets, each
// split into an array of item ids, and print the transformed rows.
model.transform(Seq(
  "g1",
  "g2",
  "g3",
  "g4",
  "g1 g2",
  "g1 g3",
  "g2 g3"
).map(_.split(" ")).toDF("items")).show()
$$Confidence_{g3 \to g1}=\frac{Freq_{g1,g3}}{Freq_{g3}}=\frac{1}{2}$$
$$Confidence_{g1 \to g3}=\frac{Freq_{g1,g3}}{Freq_{g1}}=\frac{1}{4}$$
|