將 C4.5 的 .data 和 .names 文件轉換成 ARFF 數據

package cn.ac.ict.ics.utils;


import lombok.Cleanup;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.StringTokenizer;
import java.util.Vector;


/**
 * Converts a C4.5-style dataset (a {@code .names} header file plus a {@code .data} file)
 * into a single Weka ARFF file.
 *
 * <p>Conventions implemented by this class:
 * <ul>
 *   <li>every non-comment line of the {@code .names} file has the form {@code name: values};
 *       lines starting with {@code |} are comments;</li>
 *   <li>the last non-blank line of the {@code .names} file is taken to be the class attribute
 *       and is always declared as the nominal set {@code {-1,0,1}}, whatever values it lists;</li>
 *   <li>the last column of every {@code .data} row is an integer label that is collapsed to its
 *       sign (-1, 0 or 1).</li>
 * </ul>
 *
 * Created by qibaoyuan on 13-8-21.
 */
public class ArffConverter {

    /** Charset used for every file this class reads or writes. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /** Line terminator appended after every record written to the ARFF file. */
    private static final String EOL = "\r\n";

    /** Values declared for the class attribute. */
    private static final String[] CLASS_VALUES = {"-1", "0", "1"};

    /** A progress message is printed once per this many data rows. */
    private static final int PROGRESS_INTERVAL = 5000;

    /**
     * Reads {@code file + ".names"} and {@code file + ".data"} and writes {@code file + ".arff"}.
     * The generated header is also echoed to standard output.
     *
     * @param file common path prefix of the three files, without extension
     * @throws IOException              if the {@code .names} file is missing, unreadable or
     *                                  contains no lines, or if the ARFF/data files cannot be
     *                                  opened, read or written
     * @throws IllegalArgumentException if a {@code .names} line is not of the form
     *                                  {@code name: values}
     * @throws NumberFormatException    if the last column of a data row is not an integer
     */
    public void ConvertCommonFile2Arff(String file) throws Exception {
        String header = file + ".names";
        String data = file + ".data";
        String arff = file + ".arff";

        String[] lines = readArrayOfStringsFromFile(header);
        if (lines == null) {
            // The reader reports I/O problems by returning nothing; fail with a clear message
            // instead of a NullPointerException further down.
            throw new IOException("No attribute definitions could be read from " + header);
        }

        // try-with-resources closes (and flushes) both streams even when conversion fails.
        try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(arff), UTF8)) {
            int numberOfAttributes = writeHeader(bw, lines, header, data);
            try (BufferedReader dataBr = Files.newBufferedReader(Paths.get(data), UTF8)) {
                writeData(bw, dataBr, numberOfAttributes);
            }
        }
    }

    /** Writes the {@code @relation}, {@code @attribute} and {@code @data} lines; returns the attribute count. */
    private static int writeHeader(BufferedWriter bw, String[] lines, String header, String data)
            throws IOException {
        String relation = "@relation '" + header + "_" + data + "'\n";
        System.out.println(relation);
        bw.write(relation + EOL);

        int numberOfAttributes = 0;
        for (int i = 0; i < lines.length; i++) {
            // All spaces are removed, so names and values are written to the ARFF unquoted.
            String line = lines[i].trim().replace(" ", "");
            if (line.isEmpty() || line.startsWith("|")) {
                continue; // blank line or comment
            }

            StringTokenizer tokens = new StringTokenizer(line, ":");
            int tokenCount = tokens.countTokens();
            if (tokenCount < 2) {
                throw new IllegalArgumentException(
                        "Error parsing line (expected 'name: values'):\n" + line);
            }
            if (tokenCount > 2) {
                // Extra ':' separators: report it and carry on with the first two tokens.
                System.err.println("Error parsing line:\n" + line);
            }

            String attributeName = tokens.nextToken();
            String values = tokens.nextToken();
            String declaration;
            if (i == lines.length - 1) {
                // The last line is the class attribute; its declared values are fixed.
                declaration = toWekaFormat(attributeName, CLASS_VALUES);
            } else if (takeDot(values).endsWith("continuous")) {
                // The terminating period is accepted but not required.
                declaration = "@attribute " + attributeName + " numeric";
            } else {
                declaration = toWekaFormat(attributeName, nominalValues(values));
            }
            System.out.println(declaration);
            bw.write(declaration + EOL);
            numberOfAttributes++;
        }

        String dataMarker = "\n\n" + "@data\n";
        System.out.println(dataMarker);
        bw.write(dataMarker + EOL);
        return numberOfAttributes;
    }

    /** Copies every data row to the ARFF file, replacing the trailing label with its sign. */
    private static void writeData(BufferedWriter bw, BufferedReader dataBr, int numberOfAttributes)
            throws IOException {
        int counter = 0;
        String line;
        while ((line = dataBr.readLine()) != null) {
            String row = line.trim();
            if (row.isEmpty() || row.startsWith("|")) {
                continue; // blank line or comment
            }
            // A row without any comma consists of the label only.
            int lastComma = row.lastIndexOf(',');
            String features = row.substring(0, lastComma + 1);
            // takeDot trims and tolerates a label terminated by a period.
            int label = Integer.parseInt(takeDot(row.substring(lastComma + 1)));
            String normalized = features + Integer.signum(label);

            if (counter++ % PROGRESS_INTERVAL == 0) {
                System.out.println("processed:" + counter);
            }
            bw.write(formatDataLine(normalized.replace(" ", ""), numberOfAttributes) + EOL);
        }
    }

    /**
     * Re-joins the comma-separated values of a data row with each value trimmed, and warns on
     * standard error when the number of values differs from the number of declared attributes.
     * Empty fields are dropped because {@link StringTokenizer} never returns empty tokens.
     */
    private static String formatDataLine(String line, int nnumberOfAttributes) {
        StringTokenizer tokens = new StringTokenizer(line, ",");
        int n = tokens.countTokens();
        if (n != nnumberOfAttributes) {
            System.err.println("# attributes should be " + nnumberOfAttributes +
                    " but it's " + n + " in line " + line);
        }
        StringBuilder joined = new StringBuilder(line.length());
        boolean first = true;
        while (tokens.hasMoreTokens()) {
            if (!first) {
                joined.append(',');
            }
            joined.append(tokens.nextToken().trim());
            first = false;
        }
        return joined.toString();
    }

    /** Trims the value and removes one terminating period, if there is one. */
    private static String takeDot(String last) {
        String trimmed = last.trim();
        return trimmed.endsWith(".") ? trimmed.substring(0, trimmed.length() - 1) : trimmed;
    }

    /** Builds {@code @attribute name {v1,v2,...}}; a period terminating the last value is dropped. */
    private static String toWekaFormat(String attributeName, String[] nominalValues) {
        StringBuilder out = new StringBuilder("@attribute ").append(attributeName).append(" {");
        int last = nominalValues.length - 1;
        for (int i = 0; i <= last; i++) {
            if (i > 0) {
                out.append(',');
            }
            out.append(i == last ? takeDot(nominalValues[i]) : nominalValues[i].trim());
        }
        return out.append('}').toString();
    }

    /** Splits a comma-separated value list; warns on standard error when fewer than two values are found. */
    private static String[] nominalValues(String line) {
        StringTokenizer tokens = new StringTokenizer(line, ",");
        int n = tokens.countTokens();
        if (n < 2) {
            System.err.println("Problem parsing line:\n" + line);
        }
        String[] out = new String[n];
        for (int i = 0; i < n; i++) {
            out[i] = tokens.nextToken();
        }
        return out;
    }

    /**
     * Reads the non-blank lines of a text file into an array.
     *
     * @param fileName path of the file to read
     * @return the non-blank lines in file order, or {@code null} when there are none (which
     *         includes the case where the file could not be read)
     */
    public static String[] readArrayOfStringsFromFile(String fileName) {
        Vector<String> v = readVectorOfStringsFromFile(fileName);
        if (v.isEmpty()) {
            return null;
        }
        return v.toArray(new String[v.size()]);
    }

    /**
     * Reads all non-blank lines of a UTF-8 text file.
     *
     * <p>This method is best-effort: a {@code null} name or an I/O error is reported on standard
     * error and whatever was read up to that point (possibly nothing) is returned.
     *
     * @param filename path of the file to read
     * @return the non-blank lines in file order; never {@code null}
     */
    public static Vector<String> readVectorOfStringsFromFile(String filename) {
        Vector<String> vectorOfStrings = new Vector<>();
        if (filename == null) {
            System.err.println("Passed a string that is null !");
            return vectorOfStrings;
        }

        try (BufferedReader bufferedReader = Files.newBufferedReader(Paths.get(filename), UTF8)) {
            String s;
            while ((s = bufferedReader.readLine()) != null) {
                if (s.trim().isEmpty()) {
                    System.err.println("Skipped blank line");
                } else {
                    vectorOfStrings.addElement(s);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("Problem reading file " + filename);
        }
        return vectorOfStrings;
    }
}

測試用例

package cn.ac.ict.ics.utils;

import org.junit.Assert;
import org.junit.Test;

import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;

/**
 * Unit test for {@link ArffConverter}.
 *
 * <p>The test builds a tiny dataset in a temporary directory, so it does not depend on any file
 * being present on the machine that runs it.
 *
 * Created by qibaoyuan on 13-8-21.
 */
public class ArffConverterTest {

    private static final Charset UTF8 = Charset.forName("UTF-8");

    /** Converts a three-attribute toy dataset and checks the generated header and data rows. */
    @Test
    public void testConvertCommonFile2Arff() throws Exception {
        Path dir = Files.createTempDirectory("arff-converter-test");
        Path names = dir.resolve("toy.names");
        Path data = dir.resolve("toy.data");
        Path arff = dir.resolve("toy.arff");
        try {
            Files.write(names, Arrays.asList(
                    "| toy dataset",
                    "age: continuous.",
                    "color: red,green,blue.",
                    "class: -1,0,1."), UTF8);
            Files.write(data, Arrays.asList(
                    "23,red,5",
                    "40,blue,-3",
                    "7,green,0"), UTF8);

            ArffConverter arffConverter = new ArffConverter();
            arffConverter.ConvertCommonFile2Arff(dir.resolve("toy").toString());

            String out = new String(Files.readAllBytes(arff), UTF8);
            Assert.assertTrue(out.startsWith("@relation '"));
            Assert.assertTrue(out.contains("@attribute age numeric"));
            Assert.assertTrue(out.contains("@attribute color {red,green,blue}"));
            Assert.assertTrue(out.contains("@attribute class {-1,0,1}"));
            Assert.assertTrue(out.contains("@data"));
            // Labels are collapsed to their sign.
            Assert.assertTrue(out.contains("23,red,1"));
            Assert.assertTrue(out.contains("40,blue,-1"));
            Assert.assertTrue(out.contains("7,green,0"));
        } finally {
            Files.deleteIfExists(arff);
            Files.deleteIfExists(data);
            Files.deleteIfExists(names);
            Files.deleteIfExists(dir);
        }
    }
}


發佈了409 篇原創文章 · 獲贊 21 · 訪問量 85萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章