第一個是動態網站的各個鏈接生成網站地圖(據報道google,microsoft和yahoo聯合聲明一個統一的標準sitemap 0.9,事實上目前只有google一家可以提交網站地圖, 參見:http://www.google.com/support/webmasters/bin/answer.py?answer=40318&hl=zh_CN),站點地圖範本如下:
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
<url>
<loc>http://www.example.com/</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
</urlset>
我的做法是用一張表記錄點擊的鏈接,再寫一個頁面來生成網站地圖(存儲地圖文件的目錄需要目錄寫授權)文件Sitemap.xml,代碼如下:
/// <summary>
/// Generates the sitemap file (Sitemap.xml) in the site root from the URLs
/// recorded for the given site, following the Sitemap 0.9 protocol.
/// The target directory needs write permission for the worker process.
/// </summary>
/// <param name="sid">Site code passed to the spGetSiteMap stored procedure.</param>
private void CreateXMLFile(string sid)
{
    SqlParameter param1 = new SqlParameter("@SID", SqlDbType.VarChar, 20);
    param1.Value = sid;
    IDataParameter[] parameters = new IDataParameter[] { param1 };
    DbHelperSQL dbHelper = new DbHelperSQL(connStr);
    string outParams = "";
    DataSet ds = dbHelper.RunProcedure("spGetSiteMap", parameters, "TmpSiteMapInfo", ref outParams);
    if (ds.Tables[0].Rows.Count == 0)
    {
        return; // nothing to publish
    }

    // Namespace of the joint Google/Microsoft/Yahoo Sitemap 0.9 standard.
    // (The previous value, google.com/schemas/sitemap/0.9, mixed the old
    // Google-only 0.84 host with the 0.9 version; sitemaps.org hosts 0.9.)
    const string XMLSpace = "http://www.sitemaps.org/schemas/sitemap/0.9";
    // One fixed date (W3C yyyy-MM-dd form) for every <lastmod> in this run;
    // invariant culture keeps the output machine-readable regardless of server locale.
    string lastMod = DateTime.Now.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);

    XmlDocument doc = new XmlDocument();
    doc.AppendChild(doc.CreateXmlDeclaration("1.0", "UTF-8", null));

    // Root element. Child elements MUST be created in the same namespace,
    // otherwise XmlDocument assigns them an explicit empty namespace.
    XmlElement urlset = doc.CreateElement("urlset", XMLSpace);
    doc.AppendChild(urlset);

    foreach (DataRow dr in ds.Tables[0].Rows)
    {
        XmlElement url = doc.CreateElement("url", XMLSpace);
        urlset.AppendChild(url);

        AppendTextElement(doc, url, "loc", dr["URL"].ToString(), XMLSpace);
        AppendTextElement(doc, url, "lastmod", lastMod, XMLSpace);
        // Type "1" marks frequently-changing pages.
        AppendTextElement(doc, url, "changefreq", dr["Type"].ToString() == "1" ? "daily" : "monthly", XMLSpace);
        AppendTextElement(doc, url, "priority", dr["OrderNo"].ToString(), XMLSpace);
    }

    doc.Save(Server.MapPath("Sitemap.xml"));
}

/// <summary>
/// Creates an element named <paramref name="name"/> in namespace <paramref name="ns"/>,
/// fills it with the given text value, and appends it to <paramref name="parent"/>.
/// </summary>
private static void AppendTextElement(XmlDocument doc, XmlElement parent, string name, string value, string ns)
{
    XmlElement elem = doc.CreateElement(name, ns);
    elem.AppendChild(doc.CreateTextNode(value));
    parent.AppendChild(elem);
}
上面的代碼比較簡單,一個要點是如果上面結點(比如:urlset)帶有命名空間,則下層結點也一定要帶,否則下層結點會自動帶一個空的命名空間(好像與習慣思維相反,這點花了我不少時間)。
第二個是robots(爬蟲用的配置文件),也有相關標準,網上資料很多,下面是我寫的生成robots文件的代碼:
{
// Look up the site's tracked URLs (same stored procedure the sitemap generator uses).
SqlParameter param1 = new SqlParameter("@SID", SqlDbType.VarChar, 20);
param1.Value = sid;
IDataParameter[] parameters = new IDataParameter[] { param1 };
DbHelperSQL dbHelper = new DbHelperSQL(connStr);
string outParams = "";
DataSet ds = dbHelper.RunProcedure("spGetSiteMap", parameters, "TmpSiteMapInfo", ref outParams);
if (ds.Tables[0].Rows.Count > 0)
{
// NOTE(review): FileMode.OpenOrCreate does not truncate an existing file — if the newly
// written robots.txt is shorter than the previous one, stale bytes remain at the end.
// FileMode.Create would be safer. Also, neither stream is wrapped in using/try-finally,
// so an exception while writing leaks the file handle.
FileStream fs = new FileStream(Server.MapPath("robots.txt"), FileMode.OpenOrCreate, FileAccess.Write);
StreamWriter m_streamWriter = new StreamWriter(fs);
m_streamWriter.Flush();
// Rewind to the beginning of the file before writing the new contents.
m_streamWriter.BaseStream.Seek(0, SeekOrigin.Begin);
// Fixed header: identify the file, point crawlers at the sitemap, and allow all agents.
m_streamWriter.WriteLine("# Robots.txt file from http://www.hugesoft.net");
m_streamWriter.WriteLine("# All robots will spider the domain");
m_streamWriter.WriteLine("");
m_streamWriter.WriteLine("Sitemap: http://www.hugesoft.net/Sitemap.xml");
m_streamWriter.WriteLine("User-agent: *");
m_streamWriter.WriteLine("Disallow: ");
// Emit one "Allow:" line per tracked URL, reduced to its server-relative path.
foreach (DataRow dr in ds.Tables[0].Rows)
{
string str = dr["URL"].ToString().ToLower();
// Skip entries that are not absolute http:// URLs.
int index = str.IndexOf("http://");
if (index < 0)
continue;
// Find the first "/" after the host part; skip bare host URLs with no path.
index = str.IndexOf("/",index + 7);
if (index < 0)
continue;
// Keep only the path portion (everything from the first "/" after the host).
str = str.Substring(index);
m_streamWriter.WriteLine("Allow: " + str);
}
// Flush buffered output and close the file.
m_streamWriter.Flush();
m_streamWriter.Close();
}
}