明年要考研了,想着寫個簡單的統計詞頻的程序。這個程序的功能是:讀取文本,分割出每個單詞,統計每個單詞出現的次數,保存到數據庫並且導出為文本。寫得比較粗糙:由於不會Java爬蟲,要讀取的文本只能自己去網上找一些文獻複製;數據庫那裏用的是基礎的JDBC,寫得比較混亂;翻譯調用的是百度翻譯的接口;開發環境是IDEA。項目保存在 https://github.com/Adam-hohai/WordSplit ,我用的英文報刊參考的是何凱文的每日一句。
package hhuc.cenhelm;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import hhuc.cenhelm.tools.TransApi;
import java.io.*;
import java.sql.*;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;
/**
 * Word-frequency tool: reads a text file, splits it into words, counts how often each
 * word occurs, translates each word through the Baidu Translate client, appends the
 * result to a text file and stores/updates it in the {@code hkwDailyData} table.
 */
public class Main {
    /**
     * Baidu Translate application id and secret key.
     */
    private static final String APP_ID = "*********";
    private static final String SECURITY_KEY = "*********";

    /**
     * Input and output files used when no paths are passed on the command line.
     */
    private static final String DEFAULT_INPUT_PATH = "src/main/resources/2020033150.txt";
    private static final String DEFAULT_OUTPUT_PATH = "src/main/resources/2020033150output.txt";

    /**
     * Reads a text file into a single string.
     *
     * <p>Lines are joined with {@code '\n'} so that the last word of one line and the
     * first word of the next stay separate, and nothing is appended at end of file.
     * The file is decoded with the platform default charset.
     *
     * @param pathName path of the text file
     * @return the file content; an empty string for an empty file
     * @throws IOException if the file cannot be opened or read
     */
    public static String read(String pathName) throws IOException {
        StringBuilder content = new StringBuilder();
        // NOTE(review): decoding uses the platform default charset, as before; pass an
        // explicit charset here once the encoding of the input files is known.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(pathName)))) {
            boolean firstLine = true;
            String line;
            // readLine() returns null at end of file; that null must never be appended.
            while ((line = br.readLine()) != null) {
                if (!firstLine) {
                    content.append('\n');
                }
                content.append(line);
                firstLine = false;
            }
        }
        return content.toString();
    }

    /**
     * Splits a text into words and counts the occurrences of each word.
     *
     * <p>Words are lower-cased before counting, and words of three characters or fewer
     * are ignored.
     *
     * @param content text to analyse; {@code null} is treated as empty
     * @param rule    the set of delimiter characters (each character is one delimiter)
     * @return map from lower-cased word to its number of occurrences
     */
    public static Map<String, Integer> splitOut(String content, String rule) {
        Map<String, Integer> wordMap = new HashMap<String, Integer>();
        if (content == null) {
            return wordMap;
        }
        StringTokenizer st = new StringTokenizer(content, rule);
        while (st.hasMoreTokens()) {
            // Locale.ROOT keeps the lower-casing independent of the machine's locale.
            String word = st.nextToken().toLowerCase(Locale.ROOT);
            if (word.length() > 3) {
                Integer seen = wordMap.get(word);
                wordMap.put(word, seen == null ? 1 : seen + 1);
            }
        }
        return wordMap;
    }

    /**
     * Translates a word to Chinese through the Baidu Translate client.
     *
     * <p>The JSON response is parsed structurally: the {@code dst} field of the last
     * entry of the {@code trans_result} array is returned. If the response does not have
     * that shape (for example an error payload), the method falls back to returning the
     * last quoted token of the response, which is what earlier versions always did.
     *
     * @param transApi Baidu Translate client
     * @param word     text to translate
     * @return the translation, or an empty string when the service returned nothing
     */
    public static String translate(TransApi transApi, String word) {
        // The Baidu Translate client returns a JSON string.
        String jsonResult = transApi.getTransResult(word, "auto", "zh");
        if (jsonResult == null || jsonResult.isEmpty()) {
            return "";
        }
        try {
            // Expected success shape (per Baidu's API docs, confirm against TransApi):
            // {"from":..,"to":..,"trans_result":[{"src":..,"dst":..}]}
            JSONObject response = JSON.parseObject(jsonResult);
            Object results = (response == null) ? null : response.get("trans_result");
            if (results instanceof List && !((List<?>) results).isEmpty()) {
                List<?> entries = (List<?>) results;
                Object last = entries.get(entries.size() - 1);
                if (last instanceof Map) {
                    Object dst = ((Map<?, ?>) last).get("dst");
                    if (dst != null) {
                        return dst.toString();
                    }
                }
            }
        } catch (RuntimeException ignored) {
            // Not the expected JSON structure; use the token-based extraction below.
        }
        return lastQuotedToken(jsonResult);
    }

    /**
     * Token-based extraction: returns the last token of the response when it is split
     * on the characters {@code "}, {@code }} and {@code ]}, with JSON escapes decoded.
     */
    private static String lastQuotedToken(String jsonResult) {
        StringTokenizer tokenizer = new StringTokenizer(jsonResult, "\"}]");
        String result = "";
        while (tokenizer.hasMoreTokens()) {
            result = tokenizer.nextToken();
        }
        // Re-wrap the token as a tiny JSON document so the parser decodes escapes in it.
        String json = "{\"result\":\"" + result + "\"}";
        JSONObject jsonObject = JSON.parseObject(json);
        return jsonObject.get("result").toString();
    }

    /**
     * Opens a connection to the SQL Server database.
     *
     * @return an open {@link Connection}; the caller is responsible for closing it
     * @throws Exception if the driver class cannot be loaded or the connection fails
     */
    public static Connection getConnection() throws Exception {
        Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver");
        String url = "jdbc:sqlserver://127.0.0.1:1433;DatabaseName=EnNewspaperHKW";
        String user = "sa";
        String password = "*****";
        Connection conn = DriverManager.getConnection(url, user, password);
        System.out.println("數據庫連接成功");
        return conn;
    }

    /**
     * Runs the whole pipeline: read, split, translate, export to text and store in the
     * database.
     *
     * @param args optional: {@code args[0]} input text file, {@code args[1]} output text
     *             file; the built-in default paths are used for missing arguments
     * @throws Exception if reading the input, writing the output or any database
     *                   operation fails
     */
    public static void main(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        // A failed read is fatal: there is nothing to analyse, so let it propagate.
        String content = read(inputPath);
        System.out.println(content);

        // Punctuation and digits that separate words, plus the whitespace characters
        // (including the line breaks kept by read()).
        String rule = ", !./';:?\"()“”‘’-—$%#!&*——_1234567890|`~·[]{}" + "\t\r\n\f";
        Map<String, Integer> wordMap = splitOut(content, rule);

        // Baidu Translate client.
        TransApi transApi = new TransApi(APP_ID, SECURITY_KEY);

        // The statements are prepared once and reused for every word; try-with-resources
        // closes everything (in reverse order) even when an exception is thrown.
        try (Connection connection = getConnection();
             PreparedStatement psSel = connection.prepareStatement(
                     "select * from hkwDailyData where word=?");
             PreparedStatement psUpd = connection.prepareStatement(
                     "update hkwDailyData set frequency = frequency +? where word =?");
             PreparedStatement psIns = connection.prepareStatement(
                     "insert into hkwDailyData(word,translation,frequency) values (?,?,?)");
             // Opened in append mode so earlier runs' output is kept.
             BufferedWriter out = new BufferedWriter(new FileWriter(outputPath, true))) {

            for (Map.Entry<String, Integer> entry : wordMap.entrySet()) {
                String word = entry.getKey();
                int frequency = entry.getValue();
                String translation = translate(transApi, word);
                System.out.println(word + " " + translation + " " + frequency);

                // Export to the text file; flush per word so progress survives a crash.
                out.write(word + " " + translation + " " + frequency + "\r\n");
                out.flush();

                // Database: add to the stored frequency if the word exists, else insert.
                psSel.setString(1, word);
                boolean exists;
                try (ResultSet rsSel = psSel.executeQuery()) {
                    exists = rsSel.next();
                }
                if (exists) {
                    psUpd.setInt(1, frequency);
                    psUpd.setString(2, word);
                    psUpd.executeUpdate();
                    System.out.println("更新成功");
                } else {
                    psIns.setString(1, word);
                    psIns.setString(2, translation);
                    psIns.setInt(3, frequency);
                    psIns.executeUpdate();
                    System.out.println("插入成功");
                }
            }
        }
        System.out.println("提取結束");
    }
}