大文件字符串搜索之Java函數和調用Shell搜索的效率測試

1G的日誌文件,需要從中搜索出指定字符串所在的行和行中位置。有兩種方法,一種直接使用java的函數,一種通過調用Linux shell命令輔助處理。下面是示例程序:

# cat TestIO.java

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.regex.Pattern;
/**
 * Micro-benchmark that times two ways of locating a search term in a text file:
 * scanning every line inside the JVM ({@link #start()}) and delegating the line
 * filtering to an external {@code grep} process ({@link #startByShell()}).
 *
 * <p>Both strategies print the same start/finish/elapsed-time messages to standard
 * output. Instances are mutable and not designed for concurrent use.
 */
public class TestIO
{
        /** Charset used to decode both the file and the grep output, so match offsets agree. */
        private static final String CHARSET = "utf-8";

        /** Compiled once; matches strings made only of ASCII digits (including the empty string). */
        private static final Pattern DIGITS = Pattern.compile("[0-9]*");

        /** Number of lines consumed by the most recent run (file lines, or grep output lines). */
        private int lineNum = 0;
        private String path = "";
        private String searchStr = "";

        /**
         * Sets the file to search.
         *
         * @param value path of the file
         */
        public void setPath(String value)
        {
                path = value;
        }

        /**
         * @return the path of the file to search
         */
        public String getPath()
        {
                return path;
        }

        /**
         * Sets the literal term to look for.
         *
         * @param value the search term
         */
        public void setSearchStr(String value)
        {
                searchStr = value;
        }

        /**
         * @return the literal term to look for
         */
        public String getSearchStr()
        {
                return searchStr;
        }

        /**
         * Searches the file purely in Java: reads it line by line as UTF-8 and runs
         * {@link #searchStr(String, String)} on every line, then prints the elapsed time.
         *
         * <p>Does nothing when no path is set. Any failure is reported on standard error
         * and the method returns normally.
         */
        public void start()
        {
                if(null == path || path.length()<1)
                {
                        return;
                }
                try
                {
                        long startNanos = System.nanoTime();
                        System.out.println("Start search \""+searchStr+"\" in file: "+path);
                        // try-with-resources closes the reader (and the underlying file) even on failure.
                        try (BufferedReader reader = new BufferedReader(
                                        new InputStreamReader(new FileInputStream(new File(path)), CHARSET)))
                        {
                                String line;
                                lineNum = 0;
                                while((line = reader.readLine()) != null)
                                {
                                        lineNum ++;
                                        // The offsets are computed but not printed; presumably per-match
                                        // console output was disabled so it does not dominate the timing.
                                        this.searchStr(line, searchStr);
                                }
                        }
                        reportElapsed(startNanos);
                }
                catch(Exception e)
                {
                        e.printStackTrace();
                }
        }

        /**
         * Searches the file by letting an external {@code grep} pre-filter the matching
         * lines, then runs {@link #searchStr(String, String)} on each returned line to
         * obtain the in-line offsets, and prints the elapsed time.
         *
         * <p>{@code grep} is started directly with an argument list (no shell command
         * string), so spaces or shell metacharacters in the term or path cannot alter the
         * command. {@code -F} makes grep treat the term literally, matching the literal
         * {@code indexOf} search used on the Java side.
         *
         * <p>Does nothing when no path is set. Any failure is reported on standard error
         * and the method returns normally.
         */
        public void startByShell()
        {
                if(null == path || path.length()<1)
                {
                        // Without a file argument grep would wait on its standard input.
                        return;
                }
                Process grep = null;
                try
                {
                        long startNanos = System.nanoTime();
                        System.out.println("Start search \""+searchStr+"\" in file: "+path+ " by shell");
                        // "-e" keeps a term starting with '-' from being parsed as an option;
                        // "--" does the same for the path.
                        ProcessBuilder builder =
                                        new ProcessBuilder("grep", "-n", "-F", "-e", searchStr, "--", path);
                        // Let grep's diagnostics reach the console instead of filling an unread pipe.
                        builder.redirectError(ProcessBuilder.Redirect.INHERIT);
                        grep = builder.start();
                        try (BufferedReader reader = new BufferedReader(
                                        new InputStreamReader(grep.getInputStream(), CHARSET)))
                        {
                                String line;
                                lineNum = 0;
                                while((line = reader.readLine()) != null)
                                {
                                        lineNum ++;
                                        // With -n each output line is "<lineNumber>:<content>";
                                        // strip the prefix up to the first colon before searching.
                                        int colon = line.indexOf(':');
                                        this.searchStr(line.substring(colon+1), searchStr);
                                }
                        }
                        grep.waitFor();
                        reportElapsed(startNanos);
                }
                catch(InterruptedException e)
                {
                        // Restore the interrupt flag so callers can still observe it.
                        Thread.currentThread().interrupt();
                        e.printStackTrace();
                }
                catch(Exception e)
                {
                        e.printStackTrace();
                }
                finally
                {
                        if(grep != null)
                        {
                                grep.destroy();
                        }
                }
        }

        /**
         * Finds all non-overlapping occurrences of {@code value} in {@code src}.
         *
         * @param src   the text to scan
         * @param value the literal term to look for
         * @return the zero-based start offsets of each occurrence, each followed by a
         *         comma (e.g. {@code "0,12,"}); an empty string when there is no match or
         *         when either argument is {@code null} or {@code value} is empty
         */
        public String searchStr(String src, String value)
        {
                // An empty term matches at every position and would never advance the scan.
                if(src == null || value == null || value.length() == 0)
                {
                        return "";
                }
                StringBuilder result = new StringBuilder();
                int index = src.indexOf(value,0);
                while(index>-1)
                {
                        result.append(index).append(',');
                        // Resume after the current match so occurrences do not overlap.
                        index = src.indexOf(value,index+value.length());
                }
                return result.toString();
        }

        /**
         * Tells whether {@code str} consists solely of the ASCII digits 0-9.
         *
         * @param str the text to test; must not be {@code null}
         * @return {@code true} if every character is a digit; note that the empty
         *         string also yields {@code true}
         */
        public static boolean isNumeric(String str)
        {
                return DIGITS.matcher(str).matches();
        }

        /** Prints the completion banner and the time elapsed since {@code startNanos}. */
        private void reportElapsed(long startNanos)
        {
                System.out.println("Finished!");
                long elapsedMillis = (System.nanoTime() - startNanos) / 1000000L;
                System.out.println("Total times: "+elapsedMillis+" ms");
                System.out.println("");
        }

        /**
         * Runs both search strategies in turn.
         *
         * @param args optional: {@code args[0]} is the file path (default
         *             {@code ./testfile.txt}), {@code args[1]} is the search term
         *             (default {@code hello})
         */
        public static void main(String[] args)
        {
                String file = "./testfile.txt";
                TestIO test = new TestIO();
                if(args.length>0)
                {
                        test.setPath(args[0]);
                }
                else
                {
                        test.setPath(file);
                }
                if(args.length>1)
                {
                        test.setSearchStr(args[1]);
                }
                else
                {
                        test.setSearchStr("hello");
                }
                test.start();
                test.startByShell();
        }
}


測試文件1.4G,百萬條日誌記錄。其中

關鍵字hello只有不到50條記錄;

chipkill佔20%左右記錄數;

error佔50%左右記錄數;

mainbuild166佔99%左右記錄數;


測試結果:

[root@mainbuild166 io]# java TestIO ./testfile.txt hello

Start search "hello" in file: ./testfile.txt

Finished!

Total times: 7825 ms


Start search "hello" in file: ./testfile.txt by shell

Finished!

Total times: 3080 ms


[root@mainbuild166 io]# java TestIO ./testfile.txt chipkill

Start search "chipkill" in file: ./testfile.txt

Finished!

Total times: 8760 ms


Start search "chipkill" in file: ./testfile.txt by shell

Finished!

Total times: 3732 ms


[root@mainbuild166 io]# java TestIO ./testfile.txt error

Start search "error" in file: ./testfile.txt

Finished!

Total times: 11339 ms


Start search "error" in file: ./testfile.txt by shell

Finished!

Total times: 8163 ms


[root@mainbuild166 io]# java TestIO ./testfile.txt mainbuild166

Start search "mainbuild166" in file: ./testfile.txt

Finished!

Total times: 9938 ms


Start search "mainbuild166" in file: ./testfile.txt by shell

Finished!

Total times: 12531 ms


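從以上測試結果中可以看出,當結果集遠小於數據集時(如hello、chipkill),調用shell的方法明顯比直接使用java函數快;但隨着匹配行所佔比例升高,兩者差距逐漸縮小,當幾乎所有行都匹配時(mainbuild166,約99%),調用shell反而更慢(12531 ms 對 9938 ms)。實際應用中結果集通常遠小於數據集,因此調用shell的方式一般更合適。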

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章