深感找喫的地方不方便,於是萌生了把水木food版的文章搬到手機上去的想法。
不過E60找不到軟件支持類似"桌面搜索"的文檔內容搜索功能,這樣面對上千篇從food版批量down下來的htm文章,找起來就相當喫力了。翻來弄去發現手機本身的自帶搜索功能可以搜短信和郵件,而且內容搜索也符合我的要求。
於是今天就弄了一天,怎麼把那些food版的批量下載後的htm文件,變成存在我手機上的電子郵件,以便出門在外也可以搜索。
整個過程還算順利:
第一步,先架一個smtp和pop3服務器。
Windows 2003可以架,邊開我的2003虛擬機邊在網上搜有沒有更快的方法。
找到一個Foxmail Server for Windows 公開測試版,看起來直接就能用,就直接把我的虛擬機關掉了,太卡了。
然後按提示裝下來,拿outlook測一下,好用。贊~
第二步,把一個個文件用smtp發出爲一封封信。開始msdn找啊找,找到System.Net.Mail,搞定。
第三步,html格式不爽,想把那些tag都去掉,直接就能用手機看了。繼續msdn,找到一個System.Web.RegularExpressions下的TextRegex,完全不是那麼回事。再找,找到System.Web.HttpUtility.HtmlDecode(string),還有那麼點用,但tag還是刪不完。最後google之,找到了篇文章,rob代碼過來用了。
第四步,跑一下工程,發了一千多封信,outlook收一下正常。不過在手機上收就不那麼順利了,數字就不支持四位數的,而且一下子收太多的信會超時。只好100封100封地發,再用手機一點一點地收。
總算弄完了,希望以後會有用。睡了。zzzzzZZZZZ
using System.Collections.Generic;
using System.Collections;
using System.Text.RegularExpressions;
using System.Text;
using System.IO;
using System.Web;
using System.Web.RegularExpressions;
using System.Xml;
using System.Net.Mail;
namespace GetHtmlTitleFromFolder
{
class Program
{
/// <summary>
/// Sends one hand-selected batch of the HTML files in the Food folder as plain-text
/// e-mails through the SMTP server listening on localhost:8025.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    SmtpClient client = new SmtpClient("localhost", 8025);
    string folder = @"C:/Documents and Settings/Administrator/My Documents/Food";
    DirectoryInfo dir = new DirectoryInfo(folder);
    Hashtable ht = GetFileTitle(dir);
    FileInfo[] files = dir.GetFiles();

    // The batch window is edited by hand between runs: indices [startNum, endNum).
    // NOTE(review): with these values the window is 1001..1099, i.e. 99 files, and
    // index endNum itself is not sent by this run - confirm that consecutive
    // batches do not skip one file each.
    int step = 700;
    int startNum = 301 + step;
    int endNum = 400 + step;

    // Clamp the window so a batch that runs past the last file ends quietly
    // instead of throwing IndexOutOfRangeException.
    int lastExclusive = System.Math.Min(endNum, files.Length);

    for (int i = startNum; i < lastExclusive; i++)
    {
        FileInfo file = files[i];
        string fileName = file.Name;

        // Subject = file name without its extension + the cleaned page title.
        // Concatenating the Hashtable value directly tolerates a missing entry
        // (null becomes an empty string) where ToString() would throw.
        string title = Path.GetFileNameWithoutExtension(fileName) + ht[fileName];

        string txt;
        // Dispose the reader so the file handle is released before the next file.
        using (StreamReader sr = new StreamReader(file.FullName, Encoding.Default))
        {
            txt = sr.ReadToEnd();
        }

        string body = StripHTML(txt);
        client.Send("[email protected]", "[email protected]", title, body);
    }
}
/// <summary>
/// Builds a lookup from file name to a cleaned-up page title for every file
/// directly inside <paramref name="dir"/>.
/// </summary>
/// <param name="dir">Directory whose files are read as HTML text.</param>
/// <returns>
/// A <see cref="Hashtable"/> keyed by <c>FileInfo.Name</c>. Each value is the text
/// of the first <c>&lt;title&gt;</c> element found on a single line, after the
/// character replacements below; it is an empty string when no title is found.
/// </returns>
public static Hashtable GetFileTitle(DirectoryInfo dir)
{
    // Built once instead of per file. Group 1 is the title text; the lazy
    // quantifier stops at the first closing tag, and IgnoreCase also accepts <TITLE>.
    Regex titlePattern = new Regex("<title>(.*?)</title>", RegexOptions.IgnoreCase);
    Hashtable ht = new Hashtable();

    foreach (FileInfo file in dir.GetFiles())
    {
        string text;
        // Dispose the reader so the file handle is released before the next file.
        using (StreamReader sr = new StreamReader(file.FullName, Encoding.Default))
        {
            text = sr.ReadToEnd();
        }

        // A file without a title yields an empty string rather than an
        // out-of-range Substring call.
        Match m = titlePattern.Match(text);
        string rawTitle = m.Success ? m.Groups[1].Value : string.Empty;

        // Strip or substitute characters and reply/repost markers from the title.
        // The order matters: "Re:" must be removed before the shorter "e:".
        StringBuilder sb = new StringBuilder(rawTitle);
        sb.Replace("●", ".");
        sb.Replace(" ", "");
        sb.Replace("?", "");
        sb.Replace("/", "");
        sb.Replace("zz", "");
        sb.Replace("Re:", "");
        sb.Replace("e:", "");
        sb.Replace("*", "=");

        ht.Add(file.Name, sb.ToString());
    }

    return ht;
}
/// <summary>
/// Converts an HTML document to plain text by running a fixed sequence of string and
/// regular-expression replacements over it.
/// </summary>
/// <param name="source">The HTML text to convert.</param>
/// <returns>
/// The converted text with its first character dropped; if any step throws, a message
/// box reading "Error" is shown and <paramref name="source"/> is returned unchanged.
/// </returns>
/// <remarks>
/// NOTE(review): many string literals in this method consist only of spaces, or are
/// empty, where the accompanying comments talk about line breaks and tabs. They look
/// like escape sequences (\r, \n, \t) that were lost when the code was transcribed -
/// confirm against the original source before relying on this method.
/// </remarks>
public static string StripHTML(string source)
{
try
{
string result;
// Remove HTML development formatting.
// Intended to replace line breaks with a space, because browsers insert a space there.
// NOTE(review): as transcribed this replaces a space with a space (a no-op).
result = source.Replace(" ", " ");
// As transcribed, every space becomes the "====n====" placeholder, which is turned
// back into a space further down.
// NOTE(review): the first argument was presumably a newline escape - confirm.
result = result.Replace(" ", "====n====");
// Remove step-formatting (presumably tab characters - confirm the literal).
result = result.Replace(" ", string.Empty);
// Collapse repeated spaces because browsers ignore them
result = System.Text.RegularExpressions.Regex.Replace(result,
@"( )+", " ");
// Remove the header: first normalise the opening and closing tags to bare
// <head> / </head>, then delete everything between them.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*head([^>])*>", "<head>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*head( )*>)", "</head>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(<head>).*(</head>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Delete everything from a "<P ALIGN=" up to the last "</P>" the greedy match reaches.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<P ALIGN=).*(</P>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove all scripts (prepare first by clearing attributes from the tags)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*script([^>])*>", "<script>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*script( )*>)", "</script>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
//result = System.Text.RegularExpressions.Regex.Replace(result,
// @"(<script>)([^(<script>.</script>)])*(</script>)",
// string.Empty,
// System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<script>).*(</script>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove all styles (prepare first by clearing attributes from the tags)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*style([^>])*>", "<style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*style( )*>)", "</style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(<style>).*(</style>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Replace <td> tags (intended: with a tab; as transcribed the replacement is a space)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*td([^>])*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Replace <BR> and <LI> tags (intended: with a line break; as transcribed, a space)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*br( )*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*li( )*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Replace <DIV>, <TR> and <P> tags (intended: with a paragraph break, i.e. a double
// line break; as transcribed, a space)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*div([^>])*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*tr([^>])*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*p([^>])*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove remaining tags like <a>, links, images,
// comments etc. - anything that is enclosed inside < >
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<[^>]*>", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Replace special characters.
// NOTE(review): the patterns below are literal characters; they look like HTML
// entities that were decoded in transcription (the "<" -> "<" and ">" -> ">" steps
// are no-ops as written) - confirm.
result = System.Text.RegularExpressions.Regex.Replace(result,
@" ", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"•", " * ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"‹", "<",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"›", ">",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"™", "(tm)",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"⁄", "/",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<", "<",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@">", ">",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"©", "(c)",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"®", "(r)",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove all other entities: "&", then 2 to 6 characters, then ";". More specific
// replacements can be added, see
// http://hotwired.lycos.com/webmonkey/reference/special_characters/
result = System.Text.RegularExpressions.Regex.Replace(result,
@"&(.{2,6});", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// for testing
//System.Text.RegularExpressions.Regex.Replace(result,
// this.txtRegex.Text,string.Empty,
// System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Make line breaking consistent.
// NOTE(review): String.Replace throws ArgumentException for an empty search string,
// so as transcribed this call would send every invocation into the catch block and
// return source unchanged. The literal was probably an escape sequence - confirm.
result = result.Replace("", "");
// As transcribed, removes every space character (literal presumably garbled - confirm)
result = result.Replace(" ", "");
// Turn the "====n====" placeholder inserted near the top back into a space
// (presumably meant to be a line break - confirm)
result = result.Replace("====n====", " ");
// Put a space after every colon
result = result.Replace(":", ": ");
// NOTE(review): the next two replacements are no-ops as transcribed
result = result.Replace(" ", " ");
result = result.Replace(" ", " ");
// Remove extra line breaks and tabs:
// replace over 2 breaks with 2 and over 4 tabs with 4.
// Prepare first to remove any whitespace in between
// the escaped characters and remove redundant tabs in between line breaks.
// NOTE(review): the six patterns below read identically here because their break/tab
// escapes appear to have been lost; as written each collapses a run of three or more
// spaces into one.
result = System.Text.RegularExpressions.Regex.Replace(result,
"( )( )+( )", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"( )( )+( )", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"( )( )+( )", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"( )( )+( )", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove redundant tabs
result = System.Text.RegularExpressions.Regex.Replace(result,
"( )( )+( )", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Replace multiple tabs following a line break with just one tab
result = System.Text.RegularExpressions.Regex.Replace(result,
"( )( )+", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Initial replacement target string for line breaks
string breaks = " ";
// Initial replacement target string for tabs
string tabs = " ";
// Each pass replaces the current target strings and then lengthens each target by
// one unit, so successively longer runs are collapsed. The loop bound is re-read
// from result.Length on every iteration.
for (int index = 0; index < result.Length; index++)
{
result = result.Replace(breaks, " ");
result = result.Replace(tabs, " ");
breaks = breaks + " ";
tabs = tabs + " ";
}
// Done. Substring(1) drops the first character of the result; on an empty result
// it throws and the catch block below returns source instead.
return result.Substring(1);
}
catch
{
// Catch-all: any failure above is reported only as a generic message box, and the
// unmodified input is returned.
System.Windows.Forms.MessageBox.Show("Error");
return source;
}
}
}
}