package testlucene;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
public class SAXxhtml extends DefaultHandler {
/**
* Logger for this class
*/
private static final Logger logger = Logger.getLogger(SAXxhtml.class);
public StringBuffer sb = new StringBuffer();
public boolean usable = true;
private String sPath = "";
public SAXxhtml() {
super();
// TODO Auto-generated constructor stub
// PropertyConfigurator.configure("log4j.properties");
BasicConfigurator.configure();
}
public void startElement(String namespaceURI, String localName,
String rawName, Attributes atts) throws SAXException {
if (rawName.equals("style") || rawName.equals("script")) {
usable = false;
}
}
// 解析完成後的統計工作
public void endDocument() throws SAXException {
try {
PrintWriter pw = new PrintWriter(new FileOutputStream(sPath));
pw.print(sb.toString());
pw.flush();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void characters(char[] ch, int start, int length) {
String charEncontered = new String(ch, start, length);
/*
* if (!charEncontered.startsWith("<!")||!charEncontered.startsWith("<
* ")) { sb.append("\n"); sb.append(charEncontered); }
*/
if (usable) {
sb.append(charEncontered);
sb.append("\n");
}
usable = true;
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
/*
* SAXParserFactory spf = SAXParserFactory.newInstance(); XMLReader
* xmlReader = null; SAXParser saxParser = null; try { //
* 創建一個解析器SAXParser對象 saxParser = spf.newSAXParser(); //
* 得到SAXParser中封裝的SAX XMLReader xmlReader = saxParser.getXMLReader();
* saxParser.parse(new File("d:/sina.xml"), new SAXxhtml()); } catch
* (Exception ex) { logger.error("main(String[]) - " + ex, ex);
* System.exit(1); }
*/
}
@Override
public void endElement(String arg0, String arg1, String arg2)
throws SAXException {
// TODO Auto-generated method stub
super.endElement(arg0, arg1, arg2);
}
public void parse(String sPath, String Scontent) {
this.sPath = sPath;
try {
// System.out.println(Scontent);
HtmlCleaner hc = new HtmlCleaner(Scontent);
hc.clean();
PrintWriter pw = new PrintWriter(new FileOutputStream("e:/tmpfile/tmp.txt"));
pw.print(sb.toString());
pw.flush();
pw.close();
FileInputStream fis = new FileInputStream(new File("e:/tmpfile/tmp.txt"));
String mid = hc.getBrowserCompactXmlAsString();
StringReader sr = new StringReader(mid);
InputSource iSrc = new InputSource(sr);
System.out.println(iSrc.toString());
SAXParserFactory spf = SAXParserFactory.newInstance();
XMLReader xmlReader = null;
SAXParser saxParser = null;
// 創建一個解析器SAXParser對象
saxParser = spf.newSAXParser();
// 得到SAXParser中封裝的SAX XMLReader
xmlReader = saxParser.getXMLReader();
saxParser.parse(fis, new SAXxhtml());
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ParserConfigurationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SAXException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
具體思路是Html->xml,然後就可以用sax對xml解析,但是程序總調不通,有人能幫助解決一下麼?