原创

spark读取GBK文件的方法

spark读取GBK文件乱码

Spark 的 textFile 方法内部固定按 UTF-8 解码文件内容,读取其他编码(例如 GBK)的文件时就会出现乱码,如下面的代码所示:


/**
  * Reproduces the problem: a GBK-encoded file read through `SparkContext.textFile`
  * is printed as garbled text, because `textFile` always decodes lines as UTF-8.
  */
object Test2 {
  def main(args: Array[String]): Unit = {
    // Local session using every available core.
    val session = SparkSession.builder
      .appName("PowerLeo")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    val context = session.sparkContext
    context.setLogLevel("WARN")

    // GBK-encoded input file on the local file system.
    val inputPath = "file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT"

    // Print every line as decoded by textFile; non-ASCII characters come out garbled.
    val lines = context.textFile(inputPath)
    lines.foreach(line => println(line))

    session.stop()
  }
}

结果截图:

spark

解决方法

改用 hadoopFile 读取每一行的原始字节(Text),再按文件的实际编码(这里是 GBK)显式解码成字符串:


/**
  * Reads a GBK-encoded text file with Spark without garbling its content.
  *
  * `SparkContext.textFile` always decodes lines as UTF-8, so the helpers below read the raw
  * bytes of every line through `hadoopFile` and decode them with an explicit charset.
  *
  * Note: `TextInputFormat`, `InputSplit` and `FileSplit` must be the classes from the old
  * `org.apache.hadoop.mapred` API, because `SparkContext.hadoopFile` only accepts old-API
  * input formats.
  */
object Test2 {
  def main(args: Array[String]): Unit = {
    // NOTE(review): nothing in this example appears to use Hive; enableHiveSupport() is kept
    // as in the original, but it could presumably be dropped -- confirm.
    val spark = SparkSession
      .builder
      .appName("PowerLeo")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // Stop the session even when the job throws, so the local Spark context is always released.
    try {
      spark.sparkContext.setLogLevel("WARN")

      val sc = spark.sparkContext

      val inputPath = "file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT"

      // sc.textFile(inputPath) would decode the file as UTF-8 and print garbled text.
      // transfer(sc, inputPath) is the simpler alternative that yields only the decoded lines.
      val fileRdd = loadFileToRdd(sc, inputPath)

      fileRdd.foreach(println(_))
    } finally {
      spark.stop()
    }
  }

  /**
    * Reads a text file and decodes every line with the given charset.
    *
    * @param sc       Spark context used to read the file
    * @param path     path of the file to read
    * @param encoding charset name of the file content, "GBK" by default
    * @return one decoded string per line of the file
    */
  def transfer(sc: SparkContext, path: String, encoding: String = "GBK"): RDD[String] = {
    sc.hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], 1)
      // Text.getBytes exposes the backing array, which may be longer than the record;
      // only the first getLength bytes are valid, so decode exactly that range.
      .map { case (_, line) => new String(line.getBytes, 0, line.getLength, encoding) }
  }

  /**
    * Reads a text file and decodes every line with the given charset, tagging each line
    * with the name of the file it came from.
    *
    * @param sc       Spark context used to read the file
    * @param path     path of the file to read
    * @param encoding charset name of the file content, "GBK" by default
    * @return (file name without its directory, decoded line content, the constant 1);
    *         the trailing 1 is presumably a per-line counter for later aggregation -- confirm
    */
  def loadFileToRdd(sc: SparkContext, path: String, encoding: String = "GBK"): RDD[(String, String, Int)] = {
    sc.hadoopFile[LongWritable, Text, TextInputFormat](path)
      // hadoopFile is typed as a plain RDD; the cast exposes mapPartitionsWithInputSplit,
      // which gives access to the split (and therefore the file) behind each partition.
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((inputSplit: InputSplit, iterator: Iterator[(LongWritable, Text)]) => {
        // The file name is the same for every record of a split, so resolve it once.
        val fileName = inputSplit.asInstanceOf[FileSplit].getPath.getName
        iterator.collect {
          case (_, line) if line != null =>
            // Decode only the valid byte range of the Text buffer (see transfer).
            (fileName, new String(line.getBytes, 0, line.getLength, encoding), 1)
        }
      })
  }
}

运行截图:

res

正文到此结束