/**
 * Demo: builds a Bloom filter over the `id_` column of a generated DataFrame and
 * uses it, through a UDF, to filter a second DataFrame.
 *
 * The probing UDF takes a `String`, so this demo is written for a string-typed
 * `id_` column.
 * NOTE(review): with a numeric `id_` column the values are presumably inserted
 * into the filter as longs but probed as strings, so lookups would not match --
 * confirm before reusing this with non-string columns.
 */
object SparkAutoBloomFilterDemo {

  /**
   * Runs the demo: prints both DataFrames, builds the Bloom filter from `df`,
   * then prints the rows of `df2` whose `id_` the filter reports as absent.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()
    import spark.implicits._

    val df: DataFrame = getGenerDataframe(3, spark)
    // df2 is the very same DataFrame as df, so every non-null id_ probed below
    // was also inserted into the filter.
    val df2: DataFrame = df

    println("=====df")
    df.show()
    println("=====df2")
    df2.show()

    // Keep the row count as a Long: narrowing it with .toInt would wrap around
    // for DataFrames with more than Int.MaxValue rows. Clamp to at least 1
    // because a Bloom filter cannot be created for zero expected insertions.
    val expectedNumItems: Long = math.max(df.count(), 1L)
    val bf = df.stat.bloomFilter("id_", expectedNumItems, 0.01)

    println("=====dfmight")
    // Keeps only rows the filter says are definitely absent; rows with a null
    // id_ are kept as well, since the UDF returns false for null.
    df2.where(!might_contain(bf)($"id_")).show()
  }

  /**
   * Wraps a Bloom filter in a Spark UDF over a string column.
   *
   * @param f the Bloom filter to probe
   * @return a UDF that yields `false` for a null input and otherwise the result
   *         of `f.mightContain` for the value
   */
  def might_contain(
      f: org.apache.spark.util.sketch.BloomFilter
  ): org.apache.spark.sql.expressions.UserDefinedFunction =
    udf((x: String) => x != null && f.mightContain(x))
}
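// Note: the column used for Bloom-filter de-duplication must not be a numeric column; use a string-typed column (might_contain takes a String).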
|