抓取新聞

/**
     * html內容
     * @param url html地址
     * @return null
     */
    public String parser(String url)
    {

           url="http://news.baidu.com/n?cmd=1&class=civilnews&tn=rss&sub=0";

        String parse = null;
        try
        {
            String content;
            // String title ;
            content = getOneHtml(url);
            // title = getTitle(content);
            parse = "\n" + getTab(getLink(getScript(getCSS(content))));

            parse = getText(parse);

        }
        catch (IOException e)
        {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return parse;
    }

/*    *//**
     * @param args
     *//*

    public static void main(String[] args)
    {
        ParserHtml ph = new ParserHtml();
       
         * String content = ""; String title = ""; String parse=""; try {
         * content = parser(); title = getTitle(content);
         * parse=getTab(getLink(getScript(getCSS(content))));
         * System.out.println(parse); parserText(parse); //
         * System.out.println(getCSS(content)); //System.out.println("title:" +
         * title); } catch (IOException e) { // TODO Auto-generated catch block
         * e.printStackTrace(); }
        
        ph.parser("D:\\aa.html");

    }*/
   

    /**
     * 取得標題
     * @param s 內容
     * @return null
     */
    public static String getTitle(final String s)
    {
        String regex;
        String title = "";
        final List<String> list = new ArrayList<String>();
        regex = "<title>.*?</title>";// 取得標題的正則表達式
        final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        final Matcher ma = pa.matcher(s);
        StringBuffer sBuffer = new StringBuffer();
        while (ma.find())
        {
            list.add(ma.group());
        }
        for (int i = 0; i < list.size(); i++)
        {
            // title = title + list.get(i);
            sBuffer.append(list.get(i));
        }
        title = sBuffer.toString();
        return outTag(title);
    }

    /**
     * 去掉所有的html標記
     * @param s 內容
     * @return null
     */
    public static String outTag(final String s)
    {
        return s.replaceAll("<.*?>", "");// 去掉所有的html標記
    }

    /**
     * 去掉所有的html樣式
     * @param s 內容
     * @return null
     */
    public static String getCSS(final String s)
    {
        String regex;
        String outCss = s;

        regex = "(<style|<STYLE).*?(</style>|<STYLE>)";
        final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        while (ma.find())
        {
            outCss = outCss.replace(ma.group(), "");
        }
        return outCss;

    }

    /**
     * 去掉所有的script腳本
     * @param s 內容
     * @return null
     */
    public static String getScript(final String s)
    {
        String outScript = s;
        String regex;

        // regex = "<script.*?</script>";
        regex = "(<script|<SCRIPT).*?(</script>|</SCRIPT>)";
        final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        while (ma.find())
        {
            outScript = outScript.replace(ma.group(), "");
        }
        return outScript;
    }

    /**
     * 去掉所有的html標記
     * @param s 內容
     * @return null
     */
    public static String getTab(final String s)
    {
        String outScript = s;
        String regex;

        regex = "<.*?>";
        final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        while (ma.find())
        {
            outScript = outScript.replace(ma.group(), "");
            // outScript=outScript.replace(" ", "");
        }
        return outScript;
    }

    /**
     * 去掉所有的<a>標籤
     * @param s 內容
     * @return null
     */
    public static String getLink(final String s)
    {
        String outScript = s;
        String regex;

        regex = "(<a|<A).*?(</a>|</A>)";
        final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        while (ma.find())
        {
            outScript = outScript.replace(ma.group(), "");
            // outScript=outScript.replace(" ", "");
        }
        return outScript;
    }

    /**
     * 讀取一個網頁全部內容
     * @param htmlurl htmlurl
     * @return null
     * @throws IOException IOException
     */
    public String getOneHtml(final String htmlurl) throws IOException
    {
        URL url;
        String temp;
        final StringBuffer sb = new StringBuffer();
        try
        {
            url = new URL(htmlurl);
            final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));// 讀取網頁全部內容
            while ((temp = in.readLine()) != null)
            {
                sb.append(temp);
            }
            in.close();
        }
        catch (final MalformedURLException me)
        {
            System.out.println("你輸入的URL格式有問題!請仔細輸入");
            me.getMessage();
            throw me;
        }
        catch (final IOException e)
        {
            e.printStackTrace();
            throw e;
        }
        return sb.toString();
    }

    /**
     * 根據空格和文字長度,過濾非正文部分的文字
     * @param s 內容
     * @return null
     */
    public String getText(String s)
    {
        String[] array = s.split(" ");
        String str = "";
        String str2 = "";
        StringBuffer sBuffer = new StringBuffer();
        for (int i = 0; i < array.length; i++)
        {
            str2 = array[i].trim();
            if (str2.length() > Com.NUM_40)
            {
                // str += Array[i];
                sBuffer.append(array[i]);
                // System.out.println(str);
            }

        }
        str = sBuffer.toString();
        return str;
    }

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章