微博數據清洗(Java版)

大數據公益大學提供的一份數據,義務處理一下,原始數據是Excel,含有html標籤,如下:



要求清洗掉html標籤和微博內容中的url地址。


主要分爲兩部分:

1.處理文本,清洗數據。

2.處理excel讀寫操作。


上代碼:

ExcelUtil類,包含Excel2003-2007的讀寫操作,Excel使用Apache POI進行操作,需要jar包如下:


package dat.datadeal;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * Reads and writes Excel workbooks in the 2003 (.xls) and 2007 (.xlsx) formats
 * using Apache POI. The format is chosen from the file extension.
 *
 * @author daT [email protected]
 */
public class ExcelUtil {

    /** Output pattern used when a date-formatted cell is converted to text. */
    private static final String DATE_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Reads the first sheet of an Excel file fully into memory.
     *
     * <p>The number of columns is taken from the first row (row index 0); every
     * returned array has that length. Missing cells are returned as empty strings
     * and rows that do not exist in the sheet are skipped. Very large workbooks
     * need a streaming approach instead.
     *
     * @param filePath path of the workbook; a name ending in ".xlsx" (any case) is
     *                 read as Excel 2007, anything else as Excel 2003
     * @return one {@code String[]} per existing row, each element being one cell
     * @throws IllegalStateException if the file cannot be opened or read
     */
    public List<String[]> readExcel(String filePath) {
        Workbook wb = openWorkbook(filePath);
        List<String[]> dataList = new ArrayList<String[]>();

        Sheet sheet = wb.getSheetAt(0);
        Row firstRow = sheet.getRow(0);
        int totalCells = 0;
        if (firstRow != null) {
            // getLastCellNum() is "last index + 1" (or -1 for a row without cells), so
            // unlike getPhysicalNumberOfCells() it is not thrown off by gaps in the row.
            totalCells = Math.max(0, firstRow.getLastCellNum());
        }

        // Iterate up to the last row index rather than the physical row count:
        // with gaps in the sheet the physical count is smaller than the last index
        // and trailing rows would be silently dropped.
        int lastRowIndex = sheet.getLastRowNum();
        for (int r = 0; r <= lastRowIndex; r++) {
            Row row = sheet.getRow(r);
            if (row == null) {
                continue;
            }
            String[] rowValues = new String[totalCells];
            for (int c = 0; c < totalCells; c++) {
                Cell cell = row.getCell(c);
                rowValues[c] = (cell == null) ? "" : convertCellStr(cell);
            }
            dataList.add(rowValues);
        }
        return dataList;
    }

    /**
     * Opens the workbook at the given path and always closes the underlying stream.
     * Both workbook constructors consume the whole stream before returning, so the
     * stream can be closed as soon as the workbook object exists.
     */
    private Workbook openWorkbook(String filePath) {
        InputStream is = null;
        try {
            is = new FileInputStream(new File(filePath));
            if (isExcel2007(filePath)) {
                return new XSSFWorkbook(is);
            }
            return new HSSFWorkbook(is);
        } catch (IOException ex) {
            // Fail with the real cause instead of logging and hitting an NPE later.
            throw new IllegalStateException("Unable to read Excel file: " + filePath, ex);
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * Converts a cell to its textual representation.
     *
     * @param cell a non-null cell
     * @return the string value; the formula text for formula cells; an empty
     *         string for cell types that are not handled (e.g. blank or error)
     */
    private String convertCellStr(Cell cell) {
        switch (cell.getCellType()) {
            case Cell.CELL_TYPE_STRING:
                return cell.getStringCellValue();
            case Cell.CELL_TYPE_BOOLEAN:
                return String.valueOf(cell.getBooleanCellValue());
            case Cell.CELL_TYPE_NUMERIC:
                // Dates are stored as numbers; check the cell format first.
                if (DateUtil.isCellDateFormatted(cell)) {
                    return formatTime(cell.getDateCellValue());
                }
                return String.valueOf(cell.getNumericCellValue());
            case Cell.CELL_TYPE_FORMULA:
                return cell.getCellFormula();
            default:
                return "";
        }
    }

    /** Returns true when the file name ends with ".xlsx", ignoring case. */
    private boolean isExcel2007(String fileName) {
        return fileName.matches("^.+\\.(?i)(xlsx)$");
    }

    /**
     * Formats a date as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * <p>The date is formatted directly. Going through {@code Date.toString()} and
     * parsing it back with a 12-hour pattern turns 12:xx into 00:xx and depends on
     * ambiguous time zone abbreviations.
     */
    private String formatTime(Date date) {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        return new SimpleDateFormat(DATE_PATTERN).format(date);
    }

    /**
     * Writes the given rows to a new Excel file in one go, with all cells
     * centre-aligned.
     *
     * <p>The workbook format follows the file extension: ".xlsx" produces an
     * Excel 2007 workbook, anything else an Excel 2003 workbook, so the content
     * always matches the extension.
     *
     * @param filePath output file path
     * @param dataList rows to write; each inner list holds the cells of one row
     * @throws IOException if the file cannot be created or written
     */
    public void writeExcel(String filePath, List<List<String>> dataList) throws IOException {
        Workbook wb;
        if (isExcel2007(filePath)) {
            wb = new XSSFWorkbook();
        } else {
            wb = new HSSFWorkbook();
        }
        Sheet sheet = wb.createSheet("sheet");

        // One shared style: centre-aligned cells.
        CellStyle style = wb.createCellStyle();
        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);

        for (int i = 0; i < dataList.size(); i++) {
            Row row = sheet.createRow(i);
            List<String> cells = dataList.get(i);
            for (int j = 0; j < cells.size(); j++) {
                Cell cell = row.createCell(j);
                cell.setCellValue(cells.get(j));
                cell.setCellStyle(style);
            }
        }

        FileOutputStream fout = new FileOutputStream(filePath);
        try {
            wb.write(fout);
        } finally {
            // Close the stream even when writing fails.
            fout.close();
        }
    }

}

DataClean類,包含對html標籤和信息中url的清洗。

package dat.datadeal;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Cleans Weibo text exported to Excel: strips HTML markup and removes URLs.
 *
 * @author daT [email protected]
 */
public class DataClean {

    /** Matches a complete script element including its body. */
    private static final Pattern SCRIPT_PATTERN = Pattern.compile(
            "<\\s*script[^>]*>[\\s\\S]*?<\\s*/\\s*script\\s*>", Pattern.CASE_INSENSITIVE);

    /** Matches any single HTML tag. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /** Matches an http or https URL made of the characters commonly seen in links. */
    private static final Pattern URL_PATTERN =
            Pattern.compile("https?://[0-9A-Za-z:/\\-_#?=.&]*");

    /**
     * Removes HTML markup from a string.
     *
     * <p>Script elements are removed together with their content first, so that
     * script code does not end up in the text; afterwards all remaining tags are
     * stripped.
     *
     * @param inputString text that may contain HTML; may be {@code null}
     * @return the text without markup, or an empty string for {@code null} input
     */
    public static String delHtml(String inputString) {
        if (inputString == null) {
            return "";
        }
        String withoutScripts = SCRIPT_PATTERN.matcher(inputString).replaceAll("");
        return TAG_PATTERN.matcher(withoutScripts).replaceAll("");
    }

    /**
     * Removes http/https URLs from a string.
     *
     * <p>Only text that starts with a literal {@code http://} or {@code https://}
     * is treated as a URL; ordinary text containing colons or slashes is kept.
     *
     * @param str text that may contain URLs; may be {@code null}
     * @return the text without URLs, or an empty string for {@code null} input
     */
    public static String dealWithUrl(String str) {
        if (str == null) {
            return "";
        }
        return URL_PATTERN.matcher(str).replaceAll("");
    }

    /**
     * Reads a workbook, cleans every cell and writes the result to a new workbook.
     *
     * @param args optional: {@code args[0]} is the input path and {@code args[1]}
     *             the output path; the built-in default paths are used when absent
     * @throws IOException if the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputPath = "/home/dat/javatest/微博數據_.xlsx";
        String outputPath = "/home/dat/javatest/weibo.xlsx";
        if (args != null && args.length > 0) {
            inputPath = args[0];
        }
        if (args != null && args.length > 1) {
            outputPath = args[1];
        }

        ExcelUtil excelUtil = new ExcelUtil();
        List<String[]> readList = excelUtil.readExcel(inputPath);
        List<List<String>> writeList = new ArrayList<List<String>>();
        for (String[] lineArray : readList) {
            List<String> strList = new ArrayList<String>();
            for (String str : lineArray) {
                // Strip markup first so URLs inside tag attributes disappear with the tag.
                strList.add(dealWithUrl(delHtml(str)));
            }
            writeList.add(strList);
        }

        excelUtil.writeExcel(outputPath, writeList);
        System.out.println("job has finished...........");
    }
}


清洗後數據:


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章