11 Ways to Implement WordCount
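The single Scala object below implements WordCount on a Spark RDD in eleven ways: groupBy, groupByKey, reduceByKey, aggregateByKey, foldByKey, combineByKey, countByKey, countByValue, and reduce / aggregate / fold over per-word maps (the last three are grouped under wordCount91011).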
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
object Spark04_WordCount {

    def main(args: Array[String]): Unit = {
        val sparkConf = new SparkConf().setMaster("local").setAppName("WordCount")
        val sc = new SparkContext(sparkConf)

        // Swap in any of the implementations below.
        wordCount1(sc)

        sc.stop()
    }
    // 1. groupBy: group the words by their own value, then count each group's size
    def wordCount1(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val group: RDD[(String, Iterable[String])] = words.groupBy(word => word)
        val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
        wordCount.collect().foreach(println) // prints (order may vary): (Hello,2), (Scala,1), (Spark,1)
    }
    // 2. groupByKey: map each word to (word, 1), group by key, then count each group's size
    def wordCount2(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val wordOne = words.map((_, 1))
        val group: RDD[(String, Iterable[Int])] = wordOne.groupByKey()
        val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
    }
    // 3. reduceByKey: sum the 1s per key; values are combined map-side before the shuffle
    def wordCount3(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val wordOne = words.map((_, 1))
        val wordCount: RDD[(String, Int)] = wordOne.reduceByKey(_ + _)
    }
    // 4. aggregateByKey: zero value 0, with separate intra-partition (seqOp) and
    // inter-partition (combOp) functions; both are addition here
    def wordCount4(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val wordOne = words.map((_, 1))
        val wordCount: RDD[(String, Int)] = wordOne.aggregateByKey(0)(_ + _, _ + _)
    }
    // 5. foldByKey: a simplified aggregateByKey for when seqOp and combOp are the same function
    def wordCount5(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val wordOne = words.map((_, 1))
        val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_ + _)
    }
    // 6. combineByKey: the most general of the byKey aggregations, taking three functions;
    // the type annotations are needed because the combiner type cannot be inferred here
    def wordCount6(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val wordOne = words.map((_, 1))
        val wordCount: RDD[(String, Int)] = wordOne.combineByKey(
            v => v,                   // createCombiner: turn a key's first value into the combiner
            (x: Int, y) => x + y,     // mergeValue: merge a value into the combiner within a partition
            (x: Int, y: Int) => x + y // mergeCombiners: merge combiners across partitions
        )
    }
    // 7. countByKey: an action on the (word, 1) pairs that counts occurrences of each key
    // and returns the result as a Map on the driver
    def wordCount7(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val wordOne = words.map((_, 1))
        val stringToLong: collection.Map[String, Long] = wordOne.countByKey()
    }
    // 8. countByValue: an action that counts occurrences of each element directly,
    // so no (word, 1) mapping is needed
    def wordCount8(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val stringToLong: collection.Map[String, Long] = words.countByValue()
    }
    // 9/10/11. reduce, aggregate, fold: wrap each word in a single-entry mutable Map
    // and merge the maps pairwise. Only the reduce variant (method 9) is shown here;
    // a sketch of the aggregate and fold variants follows below.
    def wordCount91011(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val mapWord = words.map(
            word => {
                mutable.Map[String, Long]((word, 1))
            }
        )
        val wordCount = mapWord.reduce(
            (map1, map2) => {
                // fold map2's counts into map1
                map2.foreach {
                    case (word, count) => {
                        val newCount = map1.getOrElse(word, 0L) + count
                        map1.update(word, newCount)
                    }
                }
                map1
            }
        )
        println(wordCount)
    }
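
    // A hedged sketch of the two remaining variants implied by the name wordCount91011:
    // method 10 (aggregate) and method 11 (fold), reusing the same map-merge logic as the
    // reduce version above. wordCount1011 and mergeMaps are illustrative names, not from
    // the original.
    def wordCount1011(sc: SparkContext): Unit = {
        val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val words = rdd.flatMap(_.split(" "))
        val mapWord = words.map(word => mutable.Map[String, Long]((word, 1L)))

        // shared merge function: fold map2's counts into map1
        val mergeMaps = (map1: mutable.Map[String, Long], map2: mutable.Map[String, Long]) => {
            map2.foreach {
                case (word, count) =>
                    map1.update(word, map1.getOrElse(word, 0L) + count)
            }
            map1
        }

        // 10. aggregate: an empty map as the zero value; the same merge function serves
        // as both the intra-partition and inter-partition function
        val wordCount10 = mapWord.aggregate(mutable.Map[String, Long]())(mergeMaps, mergeMaps)
        println(wordCount10)

        // 11. fold: like aggregate, but the zero value and result share the RDD's element type
        val wordCount11 = mapWord.fold(mutable.Map[String, Long]())(mergeMaps)
        println(wordCount11)
    }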
}