Definition of missing values
Missing values refer to the clustering, grouping, censoring, or truncation of data caused by a lack of information in the raw data. In other words, the values of one or more attributes in an existing dataset are incomplete.
Null values in Python
Null values in Scala
- Unit represents the absence of a value and corresponds to void in other languages. It is used as the result type of methods that return nothing meaningful. Unit has exactly one instance value, written as ().
- Null: null, i.e. the empty reference.
The null value has type scala.Null.
scala.Null and scala.Nothing are special types that handle certain "corner cases" of Scala's object-oriented type system in a uniform way.
The Null class is the type of the null reference; it is a subtype of every reference class (every class that inherits from AnyRef). Null is not compatible with value types, as the sketch below illustrates.
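To make these two points concrete, here is a minimal sketch in plain Scala (the object and method names are made up for illustration):

```scala
// Minimal sketch: Unit vs. Null in Scala (illustrative names only).
object NullVsUnitDemo {
  // A method whose result type is Unit returns no meaningful value, like void elsewhere.
  def logOnly(msg: String): Unit = println(msg)

  def main(args: Array[String]): Unit = {
    val u: Unit = ()          // Unit has exactly one instance value, written ()
    val s: String = null      // compiles: Null is a subtype of every reference type (AnyRef)
    // val n: Int = null      // does NOT compile: Null is incompatible with value types
    logOnly(s"u = $u, s = $s")
  }
}
```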
Purpose of filling missing values
Filling missing values in Spark
On a Dataset, na is exposed through the following method:
package org.apache.spark.sql
...
def na : org.apache.spark.sql.DataFrameNaFunctions = { }
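As a quick orientation, here is a minimal sketch of obtaining that handle. It assumes an existing SparkSession named spark and a small made-up DataFrame with null cells; the same df is reused in the sketches further below.

```scala
// Minimal sketch (assumptions: a SparkSession named `spark`; the columns "id" and "label" are made up).
import spark.implicits._

val df = Seq[(Option[Int], Option[String])](
  (Some(1), Some("a")),
  (None,    Some("b")),
  (Some(3), None)
).toDF("id", "label")

// `na` is only an accessor: it returns the DataFrameNaFunctions handle bound to this DataFrame.
val naFunctions: org.apache.spark.sql.DataFrameNaFunctions = df.na
```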
The functions it provides are listed as follows:
package org.apache.spark.sql
@org.apache.spark.annotation.InterfaceStability.Stable
final class DataFrameNaFunctions private[sql] (df : org.apache.spark.sql.DataFrame) extends scala.AnyRef {
def drop() : org.apache.spark.sql.DataFrame = { }
def drop(how : scala.Predef.String) : org.apache.spark.sql.DataFrame = { }
def drop(cols : scala.Array[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def drop(cols : scala.Seq[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def drop(how : scala.Predef.String, cols : scala.Array[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def drop(how : scala.Predef.String, cols : scala.Seq[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def drop(minNonNulls : scala.Int) : org.apache.spark.sql.DataFrame = { }
def drop(minNonNulls : scala.Int, cols : scala.Array[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def drop(minNonNulls : scala.Int, cols : scala.Seq[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Long) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Double) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Predef.String) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Long, cols : scala.Array[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Double, cols : scala.Array[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Long, cols : scala.Seq[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Double, cols : scala.Seq[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Predef.String, cols : scala.Array[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Predef.String, cols : scala.Seq[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Boolean) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Boolean, cols : scala.Seq[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def fill(value : scala.Boolean, cols : scala.Array[scala.Predef.String]) : org.apache.spark.sql.DataFrame = { }
def fill(valueMap : java.util.Map[scala.Predef.String, scala.Any]) : org.apache.spark.sql.DataFrame = { }
def fill(valueMap : scala.Predef.Map[scala.Predef.String, scala.Any]) : org.apache.spark.sql.DataFrame = { }
def replace[T](col : scala.Predef.String, replacement : java.util.Map[T, T]) : org.apache.spark.sql.DataFrame = { }
def replace[T](cols : scala.Array[scala.Predef.String], replacement : java.util.Map[T, T]) : org.apache.spark.sql.DataFrame = { }
def replace[T](col : scala.Predef.String, replacement : scala.Predef.Map[T, T]) : org.apache.spark.sql.DataFrame = { }
def replace[T](cols : scala.Seq[scala.Predef.String], replacement : scala.Predef.Map[T, T]) : org.apache.spark.sql.DataFrame = { }
}
Note that the class mainly exposes a few actions:
drop
def drop(how: String): DataFrame
Returns a new DataFrame that drops rows containing null or NaN values.
If how is “any”, then drop rows containing any null or NaN values. If how is “all”, then drop rows only if every column is null or NaN for that row.
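A hedged sketch of the drop overloads, reusing the hypothetical df defined earlier:

```scala
// Reuses the made-up `df` from the sketch above.
val anyNullDropped = df.na.drop()                 // same as drop("any"): drop rows with any null/NaN value
val allNullDropped = df.na.drop("all")            // drop a row only if every column is null/NaN
val idRequired     = df.na.drop(Seq("id"))        // consider only the "id" column when deciding
val atLeastTwo     = df.na.drop(minNonNulls = 2)  // keep rows with at least 2 non-null values
```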
fill
def fill(valueMap: Map[String, Any]): DataFrame
(Scala-specific) Returns a new DataFrame that replaces null values.
The key of the map is the column name, and the value of the map is the replacement value. The value must be of the following type: Int, Long, Float, Double, String, Boolean. Replacement values are cast to the column data type.
For example, the following replaces null values in column “A” with string “unknown”, and null values in column “B” with numeric value 1.0.
df.na.fill(Map(
  "A" -> "unknown",
  "B" -> 1.0
))
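The other fill overloads take a single value, optionally restricted to a subset of columns. A hedged sketch, again reusing the hypothetical df:

```scala
// Reuses the made-up `df` from the sketch above.
val zeros   = df.na.fill(0)                        // fill nulls in all numeric columns with 0
val labeled = df.na.fill("unknown", Seq("label"))  // fill nulls only in the string column "label"
```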
replace
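replace substitutes specific matching values (rather than nulls) in the chosen columns: the map keys are the values to look for and the map values are their replacements, with key and value sharing the same type. A hedged sketch, reusing the hypothetical df:

```scala
// Reuses the made-up `df` from the sketch above.
// replace targets concrete values rather than nulls:
val relabeled = df.na.replace("label", Map("b" -> "B"))  // every "b" in column "label" becomes "B"
val recoded   = df.na.replace(Seq("id"), Map(3 -> -1))   // every 3 in column "id" becomes -1
```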