Preparation
Look at the data and create a SparkContext.
import org.apache.spark.{SparkConf, SparkContext}

// Run locally with 6 threads; the application name shows up in the Spark UI
val conf = new SparkConf().setMaster("local[6]").setAppName("wordCount")
val sc = new SparkContext(conf)
// Each line of data/wordcount.txt becomes one element of the RDD
val rdd = sc.textFile("data/wordcount.txt")
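The content of data/wordcount.txt is not shown here; the examples below assume it holds plain lines of space-separated words. If the file is not at hand, a minimal sketch like the following (the sample lines are hypothetical, not from the original) builds an equivalent RDD from in-memory data:

// Hypothetical sample input, assuming each line holds space-separated words
val sampleRdd = sc.parallelize(Seq("hello spark", "hello scala", "spark spark"))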
Using groupBy
@Test
def test1(): Unit = {
  rdd.flatMap(_.split(" "))
    .groupBy(word => word)       // (word, Iterable[word]): every occurrence is shuffled
    .mapValues(_.size)           // the group size is the count for that word
    .collect()
    .foreach(println(_))
}
Using groupByKey
This is not very efficient: groupByKey shuffles every (word, 1) pair across the network before counting, with no map-side combining.
@Test
def groupByKey(): Unit = {
  rdd.flatMap(_.split(" "))
    .map((_, 1))
    .groupByKey()                // (word, Iterable[1, 1, ...]) after a full shuffle
    .mapValues(_.size)
    .collect()
    .foreach(println(_))
}
Using reduceByKey
@Test
def reduceByKey(): Unit = {
  rdd.flatMap(_.split(" "))
    .map((_, 1))
    .reduceByKey(_ + _)          // combines map-side before the shuffle, then merges per key
    .collect()
    .foreach(println(_))
}
Using aggregateByKey
@Test
def aggregateByKey(): Unit = {
  rdd.flatMap(_.split(" "))
    .map((_, 1))
    .aggregateByKey(0)(_ + _, _ + _)   // zero value, then seqOp within a partition and combOp across partitions
    .collect()
    .foreach(println(_))
}
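In the word-count job both functions of aggregateByKey are _ + _, so the distinction between them is easy to miss. The following is a minimal sketch (the (product, price) data and the variable names are hypothetical, not from the original) where the two functions differ: the accumulator is a (sum, count) pair used to compute an average per key.

// Hypothetical (product, price) pairs, assumed data just for illustration
val prices = sc.parallelize(Seq(("apple", 3), ("apple", 5), ("pear", 2), ("pear", 4)))
val avgPrice = prices.aggregateByKey((0, 0))(
  (acc, v) => (acc._1 + v, acc._2 + 1),   // seqOp: fold one value into the accumulator, within a partition
  (a, b) => (a._1 + b._1, a._2 + b._2)    // combOp: merge accumulators coming from different partitions
).mapValues { case (sum, cnt) => sum.toDouble / cnt }
avgPrice.collect().foreach(println(_))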
Using foldByKey
@Test
def foldByKey(): Unit = {
  rdd.flatMap(_.split(" "))
    .map((_, 1))
    .foldByKey(0)(_ + _)         // like reduceByKey, but with an explicit initial value per key
    .collect()
    .foreach(println(_))
}
Using combineByKey
@Test
def combineByKey(): Unit = {
  rdd.flatMap(_.split(" "))
    .map((_, 1))
    .combineByKey(
      (v: Int) => v,               // createCombiner: first value seen for a key
      (x: Int, y: Int) => x + y,   // mergeValue: fold another value in, within a partition
      (x: Int, y: Int) => x + y    // mergeCombiners: merge results from different partitions
    )
    .collect()
    .foreach(println(_))
}
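For a word count all three functions of combineByKey degenerate into addition, which hides what each one is for. A minimal sketch (the (name, score) data is hypothetical, not from the original) that computes a per-key average shows the roles more clearly: createCombiner turns the first value of a key into an accumulator, mergeValue folds further values into it inside a partition, and mergeCombiners merges accumulators from different partitions.

// Hypothetical (name, score) pairs, assumed data just for illustration
val scores = sc.parallelize(Seq(("alice", 90.0), ("alice", 80.0), ("bob", 70.0)))
val avgScore = scores.combineByKey(
  (score: Double) => (score, 1),                                        // createCombiner
  (acc: (Double, Int), score: Double) => (acc._1 + score, acc._2 + 1),  // mergeValue
  (a: (Double, Int), b: (Double, Int)) => (a._1 + b._1, a._2 + b._2)    // mergeCombiners
).mapValues { case (sum, cnt) => sum / cnt }
avgScore.collect().foreach(println(_))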
Using countByKey
@Test
def countByKey(): Unit = {
  rdd.flatMap(_.split(" "))
    .map((_, 1))
    .countByKey()                // action: returns a Map[String, Long] to the driver
    .foreach(println(_))
}
Using countByValue
@Test
def countByValue(): Unit = {
  rdd.flatMap(_.split(" "))
    .countByValue()              // action: counts each distinct word, no (word, 1) pairing needed
    .foreach(println(_))
}