利用開源js引擎rhino+jsoup進行web裁製,使用javascript來解析頁面。
JsEngin.java
package cn.tailor.engin;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import org.jsoup.Jsoup;
import org.mozilla.javascript.Context;
import org.mozilla.javascript.ScriptableObject;
import org.mozilla.javascript.commonjs.module.provider.SoftCachingModuleScriptProvider;
/**
 * Runs the tailoring script against a fetched page and returns whatever the script
 * leaves in its global {@code output} variable.
 *
 * <p>Two execution paths are offered: {@link #exedom()} goes through the
 * {@code javax.script} API, {@link #exe()} drives the Rhino API directly.
 */
public class JsEngin {

    /** Page that is fetched and handed to the script. */
    private static final String TARGET_URL = "http://www.baidu.com";

    /** Location of the script that is evaluated by both entry points. */
    private static final String SCRIPT_PATH = "D://js/yourFile.js";

    /**
     * Evaluates the script through the {@code javax.script} engine registered as "js".
     *
     * <p>The fetched page is bound as {@code oText} and also as {@code dom}, so the
     * same script works for this method and for {@link #exe()}.
     *
     * @return the script's {@code output} value as a string, or {@code null} when no
     *         engine is available, evaluation fails, or the script sets no output
     */
    public static String exedom() {
        ScriptEngineManager manager = new ScriptEngineManager();
        ScriptEngine engine = manager.getEngineByName("js");
        if (engine == null) {
            // getEngineByName returns null when no matching engine is registered.
            System.err.println("No javax.script engine registered under the name \"js\"");
            return null;
        }
        try {
            Object page = Fetch.getHtml(TARGET_URL);
            engine.put("oText", page);
            engine.put("dom", page);
            // try-with-resources closes the reader even when eval throws.
            try (FileReader reader = new FileReader(SCRIPT_PATH)) {
                engine.eval(reader);
            }
            return asString(engine.get("output"));
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Evaluates the script with the Rhino API directly.
     *
     * <p>The script sees the jsoup class object as {@code jsoup} and the fetched page
     * as {@code dom}. Script errors raised by Rhino are unchecked and propagate to the
     * caller; the Rhino context is always released.
     *
     * @return the script's {@code output} value as a string, or {@code null} when the
     *         script file cannot be read or the script sets no output
     */
    public static String exe() {
        Context context = Context.enter();
        try {
            ScriptableObject scope = context.initStandardObjects();
            Object jsoupClass = Context.javaToJS(Jsoup.class, scope);
            ScriptableObject.putProperty(scope, "jsoup", jsoupClass);
            // Wrap the page explicitly so the script receives a proper host object.
            Object page = Context.javaToJS(Fetch.getHtml(TARGET_URL), scope);
            ScriptableObject.putProperty(scope, "dom", page);
            try (FileReader reader = new FileReader(SCRIPT_PATH)) {
                // The script path is used as the source name in Rhino error messages.
                context.evaluateReader(scope, reader, SCRIPT_PATH, 1, null);
            } catch (IOException e) {
                // Covers FileNotFoundException as well; the result is then null.
                e.printStackTrace();
            }
            return asString(scope.get("output"));
        } finally {
            // Every Context.enter() must be paired with Context.exit().
            Context.exit();
        }
    }

    /**
     * Converts a script result to a Java string without assuming its runtime class.
     *
     * @param value value read back from the script scope, may be {@code null}
     * @return {@code value.toString()}, or {@code null} when {@code value} is null
     */
    private static String asString(Object value) {
        return value == null ? null : value.toString();
    }
}
TailorService.java
package cn.tailor.service;
import java.io.IOException;
import java.io.PrintWriter;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import cn.tailor.engin.JsEngin;
public class TailorService extends HttpServlet {
/**
* Constructor of the object.
*/
public TailorService() {
super();
}
/**
* Destruction of the servlet. <br>
*/
public void destroy() {
super.destroy(); // Just puts "destroy" string in log
}
/**
* The doGet method of the servlet. <br>
*
* This method is called when a form has its tag value method equals to get.
*
* @param request the request send by the client to the server
* @param response the response send by the server to the client
* @throws ServletException if an error occurred
* @throws IOException if an error occurred
*/
public void doGet(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
response.setContentType("text/html;charset=utf-8");
PrintWriter out = response.getWriter();
String html=JsEngin.exe();
out.print(html);
out.flush();
out.close();
}
/**
* The doPost method of the servlet. <br>
*
* This method is called when a form has its tag value method equals to post.
*
* @param request the request send by the client to the server
* @param response the response send by the server to the client
* @throws ServletException if an error occurred
* @throws IOException if an error occurred
*/
public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
response.setContentType("text/html");
PrintWriter out = response.getWriter();
out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");
out.println("<HTML>");
out.println(" <HEAD><TITLE>A Servlet</TITLE></HEAD>");
out.println(" <BODY>");
out.print(" This is ");
out.print(this.getClass());
out.println(", using the POST method");
out.println(" </BODY>");
out.println("</HTML>");
out.flush();
out.close();
}
/**
* Initialization of the servlet. <br>
*
* @throws ServletException if an error occurs
*/
public void init() throws ServletException {
}
}
Fetch.java
package cn.tailor.engin;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Helper for downloading a page with jsoup.
 */
public class Fetch {

    /**
     * Downloads and parses the page at {@code url} with a 6000 ms timeout.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} when the fetch fails with an
     *         {@link IOException} (the stack trace is printed)
     */
    public static Document getHtml(String url){
        Document page = null;
        try {
            page = Jsoup.connect(url).timeout(6000).get();
        } catch (IOException fetchError) {
            fetchError.printStackTrace();
        }
        return page;
    }
}
yourFile.js
// Extracts the text of the page's first <title> element into the global `output`,
// which the Java host reads back after evaluation. `dom` is the document object
// bound by the host. `output` stays null when no document is available and is ""
// when the page has no <title>, instead of throwing in either case.
var output = null;
if (typeof dom !== "undefined" && dom) {
    var titles = dom.getElementsByTag("title");
    output = titles.isEmpty() ? "" : titles.get(0).text();
}