抓取網頁中的數據

using System;

using System.Collections.Generic;

using System.Text;

using System.Data;

using System.Data.SqlClient;

using System.Collections;

using System.Text.RegularExpressions;

using System.IO;

using System.Net;



namespace CatchWeb

{

    public class CreateSql

    {

        

        public CreateSql()

        {

            

        }

        /// <summary>

        /// 獲取時間id

        /// </summary>

        public void CatchOtherID()

        {

            Log objlog = new Log();

            try

            {

                string PageText = "";

                string uri = "http://www.cnemc.cn/YeAll.aspx?id=2";//抓取時間id的網址

                TheCatchMethod tmd = new TheCatchMethod(uri);

                WebClient objClient = new WebClient();

                objClient.Credentials = CredentialCache.DefaultCredentials;

                Byte[] pageData = objClient.DownloadData(uri);

                PageText = Encoding.UTF8.GetString(pageData);

                string param = tmd.GetDateID(PageText);

                CatchOther(param);

            }

            catch(Exception ex)

            {

                objlog.WriteLog(System.DateTime.Now.AddDays(-1).ToShortDateString + ex.Message);

            }

        }

        /// <summary>

        /// 抓取網頁數據信息 type = 1

        /// </summary>

        public void CatchOther(string param)

        {

            Log objlog = new Log();

            try

            {

                string PageText = "";



                string uri = "http://www.cnemc.cn/ReportCont.aspx?" + param; //+ System.DateTime.Now.AddDays(-1).ToShortDateString();

                TheCatchMethod tmd = new TheCatchMethod(uri);

                WebClient objClient = new WebClient();

                objClient.Credentials = CredentialCache.DefaultCredentials;

                Byte[] pageData = objClient.DownloadData(uri);

                PageText = Encoding.Default.GetString(pageData);

                

                //此處要更改

                bool bl = GetSql(tmd.GetDate(PageText), System.DateTime.Now.AddDays(-1).ToShortDateString(), "1");

                if (bl == true)

                {

                    objlog.WriteLog(System.DateTime.Now.AddDays(-1).ToShortDateString() + "數據抓取成功!");

                }

            }

            catch (Exception ex)

            {

                objlog.WriteLog(System.DateTime.Now.AddDays(-1) + ex.Message);

            }

        }

        /// <summary>

        /// 抓取網頁數據信息 type = 0

        /// </summary>

        public void CatchBJ()

        {

            Log objlog = new Log();

            try

            {

                string PageText = "";



                string uri = "http://bjepb.gov.cn/air2008/Air.aspx?time=" + System.DateTime.Now.AddDays(-1).ToShortDateString();

                TheCatchMethod tmd = new TheCatchMethod(uri);

                WebClient objClient = new WebClient();

                objClient.Credentials = CredentialCache.DefaultCredentials;

                Byte[] pageData = objClient.DownloadData(uri);

                PageText = Encoding.Unicode.GetString(pageData);

                bool bl = GetSql(tmd.WebGet(PageText), System.DateTime.Now.AddDays(-1).ToShortDateString(),"0");

                if (bl == true)

                {

                    objlog.WriteLog(System.DateTime.Now.AddDays(-1).ToShortDateString() + "數據抓取成功!");

                }

            }

            catch (Exception ex)

            {

                objlog.WriteLog(System.DateTime.Now.AddDays(-1).ToShortDateString() + ex.Message);

            }

        }

        /// <summary>

        /// 構建sql

        /// </summary>

        /// <param name="arrs"></param>

        /// <returns></returns>

        public bool GetSql(ArrayList arrs,string date,string type)

        {

            ArrayList arrsqls = new ArrayList();

            foreach (ArrayList arr in arrs)

            {

                string strval = "";

                for (int i = 0; i < arr.Count; i++)

                {

                    if (arr[i].ToString() == "")

                    {

                        if (i == 1)

                        {

                            strval += "-1,";

                        }

                        else

                        {

                            strval += "null,";

                        }

                    }

                    else

                    {

                        if (i == 1)

                        {

                            strval += arr[i].ToString() + ",";

                        }

                        else

                        {

                            strval += "'" + arr[i].ToString() + "',";

                        }

                    }

                }

                strval += "'" + type + "'";

                string delsql = "delete 空氣質量日報 where 日期='" + date + "' and 子站='" + arr[0].ToString() + "';";

                string insertsql="insert into 空氣質量日報 values ('" + date + "'," + strval + ")";

                arrsqls.Insert(0, delsql);

                arrsqls.Add(insertsql);

            }

            return AlterSql(arrsqls);

        }



        /// <summary>

        /// 判讀是否可以導入 true 可以 false 不可以

        /// </summary>

        /// <param name="place"></param>

        /// <returns></returns>

        public bool IfCanImport(string place)

        {

            bool bl = false;

            string sql = "select * from 空氣質量日報 where  類別='1' and 子站='" + place + "'";

            AppData.DataAccess da = new AppData.DataAccess();

            DataTable dt = da.GetDataTable(sql);

            if (dt.Rows.Count > 0)

            {

                bl = true;

            }

            return bl;

        }

        /// <summary>

        /// 執行sql

        /// </summary>

        /// <param name="arrsqls"></param>

        /// <returns></returns>

        public bool AlterSql(ArrayList arrsqls)

        {

            AppData.DataAccess da = new AppData.DataAccess();

            return da.ExecuteCommand(arrsqls);

        }

    }

}
 
//通過正則取數據的小小框架,別人要用的時候可能需要改哦
using System;

using System.Collections.Generic;

using System.Text;

using System.Net;

using System.IO;

using System.Text.RegularExpressions;

using System.Data;

using System.Data.SqlClient;

using System.Collections;



namespace CatchWeb

{

    public class TheCatchMethod

    {

        private string uri;

        public TheCatchMethod(string uri)

        {

            this.uri = uri;

        }



        /// <summary>

        /// 網頁抓取Bj

        /// </summary>

        public ArrayList WebGet(string text)

        {

            Log lg = new Log();

            //要返回的DataTable

            ArrayList arrs = new ArrayList();

            text = text.Replace("/r/n", "");

            MatchCollection mathCltion = Regex.Matches(text, @"<table[^>]*>(?:/s|/S)*?</table>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

            try

            {

                if (mathCltion.Count > 1)

                {

                    string tabname = mathCltion[3].Groups[0].Value;

                    MatchCollection TrMatchs = Regex.Matches(tabname, @"<tr[^>]*>((/s)*(.*?)(/s)*(.*?)(/s)*(.*?)(/s)*(.*?)(/s)*)</tr>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    if (TrMatchs.Count > 1)

                    {

                        for (int i = 2; i < TrMatchs.Count; i++)

                        {

                            string trText = TrMatchs[i].Groups[0].Value;

                            MatchCollection TdMaths = Regex.Matches(trText, @"<td[^>]*>((/s)*(.*?)(/s)*(.*?)(/s)*(.*?)(/s)*(.*?)(/s)*)</td>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

                            if (TdMaths.Count >= 4)

                            {

                                ArrayList arr = new ArrayList();

                                for (int j = 0; j < TdMaths.Count; j++)

                                {

                                    arr.Add(TdMaths[j].Groups[0].Value.Replace("<td>", "").Replace("</td>", "").Replace("<font color=/"Black/">", "").Replace("</font>", ""));

                                }

                                arrs.Add(arr);

                            }

                        }

                    }

                }

            }

            catch (Exception ex)

            {

                lg.WriteLog(System.DateTime.Now.AddDays(-1).ToShortDateString() + ex.Message);

            }

            return arrs;

        }



        /// <summary>

        /// 網頁抓取Other

        /// </summary>

        public ArrayList GetDate(string text)

        {

            Log lg = new Log();

            //要返回的DataTable

            ArrayList arrs = new ArrayList();

            text = text.Replace("/r/n", "");

            MatchCollection mathCltion = Regex.Matches(text, @"<table[^>]*>(?:/s|/S)*?</table>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

            try

            {

                if (mathCltion.Count > 4)

                {

                    string tabtext = mathCltion[4].Groups[0].Value;

                    MatchCollection TrMatchs = Regex.Matches(tabtext, @"<tr[^>]*>((/s)*(.*?)(/s)*(.*?)(/s)*(.*?)(/s)*(.*?)(/s)*)</tr>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    if (TrMatchs.Count > 1)

                    {

                        for (int i = 2; i < TrMatchs.Count; i++)

                        {

                            string trText = TrMatchs[i].Groups[0].Value;

                            MatchCollection TdMaths = Regex.Matches(trText, @"<td[^>]*>((/s)*(.*?)(/s)*(.*?)(/s)*(.*?)(/s)*(.*?)(/s)*)</td>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

                            if (TdMaths.Count >= 4)

                            {

                                ArrayList arr = new ArrayList();

                                for (int j = 0; j < TdMaths.Count; j++)

                                {

                                    //<td nowrap="nowrap" align="Left"><font face="宋體">鞍山</font></td>

                                    arr.Add(TdMaths[j].Groups[0].Value.Replace("<td nowrap=/"nowrap/" align=/"Left/"><font face=/"宋體/">", "").Replace("</font></td>", ""));

                                }

                                CreateSql cql = new CreateSql();

                                if (cql.IfCanImport(arr[0].ToString()))

                                {

                                    arrs.Add(arr);

                                }

                                else

                                {

                                    continue;

                                }

                            }

                        }

                    }

                }

            }

            catch(Exception ex)

            {

                lg.WriteLog(System.DateTime.Now.AddDays(-1).ToShortDateString() + ex.Message);

            }

            return arrs;

        }

        

        /// <summary>

        /// 網頁抓取Other時間id

        /// </summary>

        public string GetDateID(string text)

        {

            Log lg = new Log();

            string id = "";

            //要返回的DataTable

            ArrayList arrs = new ArrayList();

            text = text.Replace("/r/n", "");

            //href//s*=//s*(?:/"(?<1>[^/"]*)/"|(?<1>//S+))  連接正則

            try

            {

                MatchCollection mathCltion = Regex.Matches(text, @"<table[^>]*>(?:/s|/S)*?</table>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

                if (mathCltion.Count > 0)

                {

                    text = mathCltion[3].Groups[0].Value;

                    mathCltion = Regex.Matches(text, "href//s*=//s*(?:/"(?<1>[^/"]*)/"|(?<1>//S+))", RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    if (mathCltion.Count > 0)

                    {

                        for (int i = 0; i < mathCltion.Count; i++)

                        {

                            //mathCltion[i].Groups[0].Value	href='ReportCont.aspx?type=2&id=311'>20080616



                            string temp = mathCltion[i].Groups[0].Value.Replace("</a></td><td><a", "");

                            string textYear = temp.Substring(temp.Length - 8, 4) + "-" + temp.Substring(temp.Length - 4, 2) + "-" + temp.Substring(temp.Length - 2, 2);

                            if (Convert.ToDateTime(textYear).ToShortDateString() == System.DateTime.Now.AddDays(-1).ToShortDateString())

                            {

                                id = temp.Replace("href='ReportCont.aspx?", "");//type=2&id=311'>20080616

                                id = id.Substring(0, id.Length - 10);

                                break;

                            }

                        }

                    }

                    else

                    {

                        

                    }

                }

            }

            catch (Exception ex)

            {

                lg.WriteLog(System.DateTime.Now.AddDays(-1).ToShortDateString() + ex.Message);

            }

            return id;

        }





    }

}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章