Spark + Ansj Chinese Word Segmentation in Scala

Four Chinese word segmentation tools that work with Spark:
HanLP
ansj
jieba
FudanNLP

I recommend ansj: it is fast and the segmentation quality is good. jieba and HanLP also perform well. For details, see the project pages (a minimal standalone example follows the links):
ansj: https://github.com/NLPchina/ansj_seg
HanLP: https://github.com/hankcs/HanLP
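
To get a feel for the library before wiring it into Spark, here is a minimal standalone sketch (the object name and sample sentence are mine, not part of any project):

import org.ansj.splitWord.analysis.ToAnalysis

object AnsjDemo {
  def main(args: Array[String]): Unit = {
    // Segment one sentence with the default analyzer (no custom dictionary).
    val result = ToAnalysis.parse("歡迎使用ansj中文分詞")
    // toStringWithOutNature joins the terms with the given separator,
    // dropping the part-of-speech tags.
    println(result.toStringWithOutNature(" "))
  }
}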

My code is below. Beyond the segmentation itself, it covers querying and inserting into MySQL from Scala, loading a custom dictionary, loading a stop-word dictionary, and a word count built on Spark RDDs.
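
For reference, a sketch of the sbt dependencies the program needs (the version numbers are assumptions; pin them to match your own Spark and MySQL setup):

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.4.0",  // SparkSession and the JDBC data source
  "org.ansj" % "ansj_seg" % "5.1.6",            // DicAnalysis, DicLibrary, StopRecognition
  "mysql" % "mysql-connector-java" % "5.1.47",  // com.mysql.jdbc.Driver
  "com.google.code.gson" % "gson" % "2.8.5"     // used by ConvertToJson
)

The full program: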

package WordCloud

import java.sql.{Connection, DriverManager}
import java.util

import Mysql.ConvertToJson
import domain.tb_analyze_professional_skill
import org.ansj.library.DicLibrary

import scala.io.Source
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.DicAnalysis
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf

/**
  * Created by ljq on 19-2-23.
  */
object WordCloud {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Wordcloud").setMaster("local[4]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val jdbcDF = spark.read.format("jdbc").option("url", "jdbc:mysql://xx:3306/job_data?useSSL=false")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", "job_data")
      .option("user", "xx")
      .option("password", "xx").load()
    // Column index 11 holds the free-text field to segment.
    val data = jdbcDF.rdd.map(x => x(11))

    // Load the custom dictionary: ExtendDic is a plain-text file with one word per line.
    val dicfile = raw"/home/zq/Desktop/ExtendDic"
    // Read the file line by line and add each word to the user dictionary.
    for (word <- Source.fromFile(dicfile).getLines) {
      DicLibrary.insert(DicLibrary.DEFAULT, word)
    }

    // Load the stop-word dictionary: StopWordDic is a plain-text file with one word per line.
    val stopworddicfile = raw"/home/zq/Desktop/StopWordDic"
    val filter = new StopRecognition()
    filter.insertStopNatures("w") // drop punctuation (nature "w")
    filter.insertStopRegexes("^[0-9]*$", "\\s*") // drop pure numbers and whitespace-only tokens
    for (word <- Source.fromFile(stopworddicfile).getLines) {
      filter.insertStopWords(word)
    }

    // Segment each non-null row with the user dictionary, run the stop filter,
    // and join the surviving terms with spaces (part-of-speech tags dropped).
    val splited = data.filter(_ != null)
      .map(x => DicAnalysis.parse(x.toString).recognition(filter).toStringWithOutNature(" "))

    // Classic RDD word count: split on spaces, count each term (collapsed to
    // one partition), sort by frequency descending, and keep the top 20.
    val wordcloud = splited.cache()
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _, 1)
      .sortBy(_._2, ascending = false)
      .take(20)

    // Collect the top terms into a Java list, serialize them to JSON,
    // and write the result back to MySQL.
    val list = new util.ArrayList[tb_analyze_professional_skill]()
    wordcloud.foreach(x => list.add(tb_analyze_professional_skill(x._1, x._2.toString)))
    val str = ConvertToJson.ToJson(list)
    insert(str)
  }

  def insert(result: String): Unit = {
    val driver = "com.mysql.jdbc.Driver"
    val url = "jdbc:mysql://xx:3306/job_data?useUnicode=true&characterEncoding=utf8" +
      "&useSSL=false"
    var conn: Connection = null
    var ps: java.sql.PreparedStatement = null
    val sql = "insert into tb_analyze_professional_skill(result) values(?)"
    try {
      Class.forName(driver)
      conn = DriverManager.getConnection(url, "xx", "xx")
      ps = conn.prepareStatement(sql)
      ps.setString(1, result)
      ps.executeUpdate()
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      // Close the statement before the connection; guard against failures
      // that happened before either resource was created.
      if (ps != null) ps.close()
      if (conn != null) conn.close()
    }
  }
}

package domain

/**
  * Created by ljq on 19-2-23.
  */
case class job_info(id: Int, direction: Int, job_name: String, company_name: String, job_site: String, job_salary: String, avr_salary: Double, relase_date: String, education_level: String, work_exper: String, company_welfare: String, job_resp: String, job_require: String, company_type: String, company_people_num: String, company_business: String)

case class tb_analyze_professional_skill(name: String, value: String)

case class tb_analyze_job_requirements(name: String, value: String)

package Mysql;

import com.google.gson.Gson;
import domain.tb_analyze_professional_skill;

import java.util.ArrayList;

/**
 * Created by ljq on 19-2-24.
 */
public class ConvertToJson {
    public static String ToJson(ArrayList<tb_analyze_professional_skill> list) {
        Gson gson = new Gson();
        String gsonStr = gson.toJson(list);
        return gsonStr;
    }
}
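
A quick sanity check of the JSON shape this helper produces (the sample values are invented; Gson serializes the case-class fields by reflection):

import java.util

import Mysql.ConvertToJson
import domain.tb_analyze_professional_skill

object ConvertToJsonDemo {
  def main(args: Array[String]): Unit = {
    val list = new util.ArrayList[tb_analyze_professional_skill]()
    list.add(tb_analyze_professional_skill("java", "120"))
    list.add(tb_analyze_professional_skill("spark", "95"))
    // Expected output: [{"name":"java","value":"120"},{"name":"spark","value":"95"}]
    println(ConvertToJson.ToJson(list))
  }
}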
