提取英語單詞並翻譯,存入數據庫後導出

明年就要考研了,所以想寫一個簡單的詞頻統計程序。這個程序的功能是:讀取文本,分割出每個單詞並統計每個單詞出現的次數,然後把結果保存到數據庫並導出為文本。程序寫得比較粗糙:由於不會 Java 爬蟲,要讀取的文本只能自己去網上找一些文獻複製下來;數據庫部分用的是最基礎的 JDBC,寫得比較混亂;翻譯調用的是百度翻譯的接口;開發環境是 IDEA。項目保存在 https://github.com/Adam-hohai/WordSplit ,我用的英文報刊參考的是何凱文的「每日一句」。

package hhuc.cenhelm;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import hhuc.cenhelm.tools.TransApi;

import java.io.*;
import java.sql.*;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;

public class Main {

    /**
     * 百度翻譯的id和密鑰
     */
    private static final String APP_ID = "*********";
    private static final String SECURITY_KEY = "*********";

    /**
     * 讀取txt文件
     *
     * @param pathName txt路徑
     * @return 文件內容字符串
     */
    public static String read(String pathName) throws IOException {
        StringBuilder content = new StringBuilder();
        File fileName = new File(pathName);
        InputStreamReader reader = new InputStreamReader(new FileInputStream(fileName));
        BufferedReader br = new BufferedReader(reader);
        String line = br.readLine();
        content.append(line);
        while (line != null) {
            line = br.readLine();
            content.append(line);
        }
        return content.toString();
    }

    /**
     * 將英文文獻中的單詞分離保存到鍵值對
     *
     * @param rule    分離規則
     * @param content 文獻內容
     * @return 單詞和次數的鍵值對
     */
    public static Map<String, Integer> splitOut(String content, String rule) {
        StringTokenizer st = new StringTokenizer(content, rule);
        Map<String, Integer> wordMap = new HashMap<String, Integer>();
        while (st.hasMoreElements()) {
            String word = st.nextElement().toString().toLowerCase();
//            System.out.println(word);
            if (word.length() > 3) {
                if (wordMap.get(word) == null) {
                    wordMap.put(word, 1);
                } else {
                    int frequency = wordMap.get(word);
                    wordMap.remove(word);
                    wordMap.put(word, frequency + 1);
                }
            }

        }
        return wordMap;
    }

    /**
     * 調用百度翻譯接口
     *
     * @param transApi 百度翻譯接口
     * @param word     要翻譯的內容
     * @return 翻譯結果
     */
    public static String translate(TransApi transApi, String word) {
        //百度翻譯接口返回的是json字符串
        String jsonResult = transApi.getTransResult(word, "auto", "zh");
        StringTokenizer tokenizer = new StringTokenizer(jsonResult, "\"}]");
        String result = "";
        //找到最後一個
        while (tokenizer.hasMoreTokens()) {
            result = tokenizer.nextToken();
        }
        //將json字符串簡化一下
        String json = "{\"result\":\"" + result + "\"}";
        JSONObject jsonObject = JSON.parseObject(json);
        return jsonObject.get("result").toString();
    }

    /**
     * 連接數據庫
     *
     * @return 返回Connection對象
     * @throws Exception 可能拋出異常
     */
    public static Connection getConnection() throws Exception {
        Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver");
        String url = "jdbc:sqlserver://127.0.0.1:1433;DatabaseName=EnNewspaperHKW";
        String user = "sa";
        String password = "*****";
        Connection conn = DriverManager.getConnection(url, user, password);
        System.out.println("數據庫連接成功");
        return conn;
    }

    public static void main(String[] args) throws Exception {
        // write your code here

        String content = null;
        try {
            content = read("src/main/resources/2020033150.txt");
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(content);
        String rule = ", !./';:?\"()“”‘’-—$%#!&*——_1234567890|`~·[]{}";
        Map<String, Integer> wordMap = splitOut(content, rule);
        //調用百度翻譯的接口
        TransApi transApi = new TransApi(APP_ID, SECURITY_KEY);
        //遍歷鍵值對,操作數據庫
        Connection connection = getConnection();
        PreparedStatement psSel = null, psIns = null, psUpd = null;
        ResultSet rsSel = null;
        String sqlSel, sqlIns, sqlUpd = "";
        Iterator<String> iterator = wordMap.keySet().iterator();
        while (iterator.hasNext()) {
            String word = iterator.next();
            String translation = translate(transApi, word);
            int frequency = (int)wordMap.get(word);
            System.out.println(word + " " + translation + " " + frequency);

            //寫入txt
            File fileName = new File("src/main/resources/2020033150output.txt");
            BufferedWriter out = new BufferedWriter(new FileWriter(fileName,true));//文件追加
            out.write(word + " " + translation + " " + frequency + "\r\n");
            out.flush();
            out.close();

            //操作數據庫
            sqlSel = "select * from hkwDailyData where word=?";
            psSel = connection.prepareStatement(sqlSel);
            psSel.setString(1, word);
            rsSel = psSel.executeQuery();
            if (rsSel.next()) {
                sqlUpd = "update hkwDailyData set frequency = frequency +? where word =?";
                psUpd = connection.prepareStatement(sqlUpd);
                psUpd.setInt(1,frequency);
                psUpd.setString(2,word);
                psUpd.executeUpdate();
                System.out.println("更新成功");

            } else {
                sqlIns = "insert into hkwDailyData(word,translation,frequency) values (?,?,?)";
                psIns = connection.prepareStatement(sqlIns);
                psIns.setString(1, word);
                psIns.setString(2, translation);
                psIns.setInt(3, frequency);
                psIns.executeUpdate();
                System.out.println("插入成功");
            }

        }
        psUpd.close();
        psIns.close();
        psSel.close();
        rsSel.close();
        connection.close();
        System.out.println("提取結束");
    }
}

 

 

 

 

 

 

 

 

 

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章