衆所周知,讓用戶在富文本編輯器中進行自己的輸入絕對不是一個明智的選擇,但是有的時候又沒有辦法,所以只有一條原則來保證系統的安全性,那就是我們讓用戶輸入什麼,用戶才能輸入什麼,而不是用戶想輸入什麼,他就能輸入什麼,這樣才能讓系統處於我們的掌控,不至於出現各種婁子,比如各種XSS注入什麼的。
後來我們發現有一個比較好用的東西就是JSOUP,這是一個能夠對輸入的html進行解析和過濾的Java庫,簡單來說就是可以增加白名單和黑名單(基於正則表達式)。白名單就是隻允許一個html標籤上有固定的屬性,比如我們只允許<div height="100" >,即div上只允許有height屬性,其他屬性我們都認爲是非法的,這就可以用jsoup設置白名單進行過濾。我們也可以設置黑名單,即我們覺得<div>標籤什麼屬性都可以有,但是style屬性我們不能控制,認爲它屬於黑名單,這也可以用jsoup來實現。
下面貼出一個樣例:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sf.json.JSONObject;
import net.sf.json.JsonConfig;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springside.modules.mapper.JsonMapper;
public class HTMLStringFilter {
private final static String regxpForHtml = "<([^>]*)>"; // matches any tag-like span that starts with '<' and ends with '>'
// Placeholder label for an image ("[picture]" in Chinese); no method in this class references it.
private final static String PICTURE = "[圖片]";
//private final static String regxpForImgTag = "<\\s*img\\s+([^>]*)\\s*>"; // finds IMG tags
//private final static String regxpForImaTagSrcAttrib = "src=\"([^\"]+)\""; // finds the SRC attribute of an IMG tag
/** Creates an instance; every method of this class is static, so there is no state to initialise. */
public HTMLStringFilter() {
}
public static String HTMLEncode(String fString){
fString=fString.replaceAll(" <", "<");
fString=fString.replaceAll(">", ">");
fString=fString.replaceAll(new String(new char[]{32}), " ");
fString=fString.replaceAll(new String(new char[]{9}), " ");
fString=fString.replaceAll(new String(new char[]{34}), """);
fString=fString.replaceAll(new String(new char[]{39}), "'");
fString=fString.replaceAll(new String(new char[]{13}), "");
fString=fString.replaceAll(new String(new char[]{10,10}), " </p> <p>");
fString=fString.replaceAll(new String(new char[]{10}), " <br>");
return fString;
}
/**
 * Neutralises HTML markup by replacing the angle brackets with character references.
 * <p>
 * Only {@code <} and {@code >} are rewritten; quotes and ampersands are left as they are,
 * so the result is meant for element content, not for attribute values.
 *
 * @param input the text to escape, may be {@code null}
 * @return the escaped text, or {@code null} when {@code input} is {@code null}
 */
public static String xssEscape(String input) {
    if (input == null) {
        return null;
    }
    // Literal (non-regex) replacement; the entity names are the whole point of the method.
    return input.replace("<", "&lt;").replace(">", "&gt;");
}
/**
 * Escapes all HTML markup except occurrences of one permitted tag.
 * <p>
 * Opening and closing occurrences of {@code tag} (for example {@code <img ...>} and
 * {@code </img>}) are copied through unchanged, including their attributes; everything
 * between them is passed to {@link #xssEscape(String)}. The tag name is matched literally
 * and case-sensitively and must be followed by whitespace, {@code /} or {@code >}, so
 * asking for {@code "s"} does not let {@code <script>} through. No placeholder text is
 * substituted into the input, so user text cannot be turned into markup.
 * <p>
 * Attributes of the permitted tag are NOT inspected; callers that need attribute
 * filtering should use the whitelist-based cleaning instead.
 *
 * @param str the HTML fragment to process, may be {@code null}
 * @param tag the tag name to keep; when {@code null} or empty, everything is escaped
 * @return the processed fragment, or {@code null} when {@code str} is {@code null}
 */
public static String xssEscapeExceptTag(String str, String tag) {
    if (str == null) {
        return null;
    }
    if (tag == null || tag.length() == 0) {
        return xssEscape(str);
    }
    Pattern permitted = Pattern.compile("</?" + Pattern.quote(tag) + "(?=[\\s/>])[^>]*>");
    Matcher occurrence = permitted.matcher(str);
    StringBuilder out = new StringBuilder(str.length() + 16);
    int copiedUpTo = 0;
    while (occurrence.find()) {
        // Text in front of the permitted tag is escaped, the tag itself is kept verbatim.
        out.append(xssEscape(str.substring(copiedUpTo, occurrence.start())));
        out.append(occurrence.group());
        copiedUpTo = occurrence.end();
    }
    out.append(xssEscape(str.substring(copiedUpTo)));
    return out.toString();
}
/**
 * Manual smoke check: extracts the {@code <img>} tags from a fixed sample document via
 * {@code getImgHTML} and prints each one on its own line, prefixed with "=======".
 *
 * @param args ignored
 */
public static void main(String[] args) {
    final String htmlStr = "<html>bb<img style='display:inline;' alt='[擠眼]' src='http://img.t.sinajs.cn/t4/appstyle/expression/ext/normal/c3/zy_thumb.gif' height='22' width='22' />bb<img style='display:inline;' alt='[擠眼]' src='http://img.t.sinajs.cn/t4/appstyle/expression/ext/normal/c3/zy_thumb.gif' height='22' width='22' />aaaa</html>";
    final List<String> imgTags = getImgHTML(htmlStr);
    for (int i = 0; i < imgTags.size(); i++) {
        System.out.println("=======" + imgTags.get(i));
    }
}
/**
 * Blacklist-style clean-up of an HTML fragment.
 * <p>
 * Works on a lower-cased copy of the input in two phases:
 * <ol>
 * <li>opening and closing {@code meta}, {@code script}, {@code object} and {@code embed}
 *     tags are reduced to their bare form (attributes removed);</li>
 * <li>every {@code <...>text</...>} construct whose opening tag contains one of the risky
 *     keywords ("function", "window.", "javascript:", "script", "js:", "about:", "file:",
 *     "document.", "vbs:", "frame", "cookie", "onclick", "onfinish", "onmouse", "onexit=",
 *     "onerror", "onkey", "onload", "onfocus", "onblur") is removed together with its
 *     content.</li>
 * </ol>
 * If any pattern matched, the lower-cased, rewritten text is used (so the case of the
 * remaining text is lost); otherwise the input keeps its original case. Finally the
 * percent-escapes {@code %3C} / {@code %3E} are turned into {@code &lt;} / {@code &gt;},
 * {@code %2F} is dropped and every {@code &#} is wrapped in {@code <b>} so numeric
 * character references are not interpreted.
 * <p>
 * This is a best-effort filter; a blacklist cannot anticipate every vector, so prefer the
 * whitelist-based cleaning for untrusted rich text.
 *
 * @param htmlStr the fragment to filter, may be {@code null}
 * @return the filtered fragment; the empty string for {@code null} or empty input
 */
public static String filterSafe(String htmlStr) {
    if (htmlStr == null || htmlStr.length() == 0) {
        return "";
    }
    // Locale.ROOT: with a Turkish default locale "SCRIPT" would lower-case to "scrıpt"
    // and slip past every pattern below.
    String work = htmlStr.toLowerCase(java.util.Locale.ROOT);
    boolean matched = false;

    String[] blockedTags = { "meta", "script", "object", "embed" };
    for (String tag : blockedTags) {
        // "[^>]*" replaces the former "(.[^>])*", which consumed characters in pairs and
        // therefore failed on tags whose attribute text had an odd length.
        String[][] rules = {
            { "<" + tag + "[^>]*>", "<" + tag + ">" },
            { "</" + tag + "[^>]*>", "</" + tag + ">" }
        };
        for (String[] rule : rules) {
            Matcher m = Pattern.compile(rule[0]).matcher(work);
            if (m.find()) {
                matched = true;
                work = m.replaceAll(rule[1]);
            }
        }
    }

    String[] riskyKeywords = { "function", "window\\.", "javascript:", "script",
        "js:", "about:", "file:", "document\\.", "vbs:", "frame",
        "cookie", "onclick", "onfinish", "onmouse", "onexit=",
        "onerror", "onkey", "onload", "onfocus", "onblur" };
    for (String keyword : riskyKeywords) {
        Matcher m = Pattern.compile("<[^<>]*" + keyword + "[^<>]*>[^<>]*</[^<>]*>").matcher(work);
        if (m.find()) {
            matched = true;
            work = m.replaceAll("");
        }
    }

    String result = matched ? work : htmlStr;
    // Case-insensitive because the text may or may not have been lower-cased above.
    // The brackets are emitted as entities: decoding them into live '<' / '>' after the
    // filtering above would re-open the hole this method is meant to close.
    result = result.replaceAll("(?i)%3c", "&lt;");
    result = result.replaceAll("(?i)%3e", "&gt;");
    result = result.replaceAll("(?i)%2f", "");
    result = result.replace("&#", "<b>&#</b>");
    return result;
}
/**
 * Cleans a user-supplied HTML fragment with a jsoup whitelist.
 * <p>
 * How it works:
 * <ol>
 * <li>For every {@code tag[attribute]} rule returned by {@code initRegexMap()}, the value
 *     of that attribute is reduced to the first part matching the configured regular
 *     expression; when nothing matches, the attribute is removed so an unverified value
 *     (typically {@code style}) can never reach the output.</li>
 * <li>The result is passed through {@code Jsoup.clean} with the whitelist from
 *     {@code initWhiteList()}, which drops every tag, attribute and protocol that is not
 *     explicitly allowed.</li>
 * </ol>
 * Configured expressions may be written either as plain Java regular expressions or as
 * JavaScript-style literals ({@code /pattern/flags}); see
 * {@link #compileConfiguredRegex(String)}.
 *
 * @param htmlStr the raw fragment submitted by the user (e.g. from a rich-text editor)
 * @return the cleaned fragment; the empty string when {@code htmlStr} is {@code null}
 */
public static String cleanSafeHtml(String htmlStr) {
    if (htmlStr == null) {
        return "";
    }
    Document doc = Jsoup.parseBodyFragment(htmlStr);
    OutputSettings outSet = new OutputSettings();
    outSet.prettyPrint(false);
    outSet.outline(false);
    doc.outputSettings(outSet);

    Map<String, String> valueRules = initRegexMap();
    if (valueRules != null) {
        for (Map.Entry<String, String> rule : valueRules.entrySet()) {
            String selector = rule.getKey();
            // Selectors have the form tag[attribute]; pull the attribute name back out.
            String attribute = selector.substring(selector.indexOf("[") + 1, selector.indexOf("]"));
            Pattern safeValue = compileConfiguredRegex(rule.getValue());
            for (Element el : doc.select(selector)) {
                Matcher valueMatcher = safeValue.matcher(el.attr(attribute));
                if (valueMatcher.find()) {
                    el.attr(attribute, valueMatcher.group());
                } else {
                    // Fail closed: the whitelist below allows this attribute by name, so
                    // a value that did not validate must not be left in place.
                    el.removeAttr(attribute);
                }
            }
        }
    }
    return Jsoup.clean(doc.html(), "", initWhiteList());
}

/**
 * Compiles a regular expression taken from the whitelist configuration.
 * <p>
 * A value of the form {@code /body/flags} (JavaScript literal syntax) is unwrapped: the
 * body is compiled and the flags {@code i}, {@code m} and {@code s} are mapped to the
 * corresponding {@link Pattern} flags; other JavaScript flags are ignored. Any other
 * value is compiled as-is.
 */
private static Pattern compileConfiguredRegex(String raw) {
    int closingSlash = raw.lastIndexOf('/');
    if (raw.startsWith("/") && closingSlash > 0) {
        String flagText = raw.substring(closingSlash + 1);
        if (flagText.matches("[gimsuy]*")) {
            int flags = 0;
            if (flagText.indexOf('i') >= 0) {
                flags |= Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
            }
            if (flagText.indexOf('m') >= 0) {
                flags |= Pattern.MULTILINE;
            }
            if (flagText.indexOf('s') >= 0) {
                flags |= Pattern.DOTALL;
            }
            return Pattern.compile(raw.substring(1, closingSlash), flags);
        }
    }
    return Pattern.compile(raw);
}
/** Guards the one-time construction of {@link #whitelist}. */
private static final Object WHITELIST_LOCK = new Object();

/** Lazily built whitelist; volatile so a fully populated instance is what other threads see. */
private static volatile Whitelist whitelist = null;

/**
 * Returns the jsoup whitelist described by {@code /data/whitelist.conf}, building it on
 * first use.
 * <p>
 * The configuration is a JSON document with two objects: {@code whiteList} maps each
 * allowed tag to its allowed attributes, and {@code protocols} maps {@code tag.attribute}
 * keys to the URL protocols permitted for that attribute. The whitelist is assembled in a
 * local variable and only published once it is complete.
 *
 * @return the shared whitelist instance
 * @throws IllegalStateException if the configuration cannot be read or a protocol key is
 *         not of the form {@code tag.attribute}
 */
private static Whitelist initWhiteList() {
    Whitelist ready = whitelist;
    if (ready != null) {
        return ready;
    }
    synchronized (WHITELIST_LOCK) {
        if (whitelist != null) {
            return whitelist;
        }

        String jsonString;
        Resource resource = new ClassPathResource("/data/whitelist.conf");
        InputStream input = null;
        try {
            // getInputStream() also works when the resource is packaged inside a JAR,
            // where getFile() fails.
            input = resource.getInputStream();
            // Decoded explicitly so the result does not depend on the JVM default charset.
            // NOTE(review): assumes whitelist.conf is stored as UTF-8 - confirm.
            jsonString = IOUtils.toString(input, "UTF-8");
        } catch (IOException e) {
            throw new IllegalStateException("Cannot read /data/whitelist.conf from the classpath", e);
        } finally {
            IOUtils.closeQuietly(input);
        }

        JsonConfig config = new JsonConfig();
        // Without this setting the "class" attribute would be filtered out of the JSON.
        config.setIgnoreDefaultExcludes(true);
        JSONObject jsonObject = JSONObject.fromObject(jsonString, config);
        JSONObject whitelistjson = jsonObject.getJSONObject("whiteList");
        JSONObject protocolsjson = jsonObject.getJSONObject("protocols");

        JsonMapper newMapper = new JsonMapper();
        @SuppressWarnings("unchecked")
        Map<String, Map<String, String>> attributesByTag =
                newMapper.fromJson(whitelistjson.toString(), HashMap.class);
        @SuppressWarnings("unchecked")
        Map<String, List<String>> protocolsByAttribute =
                newMapper.fromJson(protocolsjson.toString(), HashMap.class);

        Whitelist built = new Whitelist();
        for (Map.Entry<String, Map<String, String>> tagEntry : attributesByTag.entrySet()) {
            String tag = tagEntry.getKey();
            built.addTags(tag);
            for (String attribute : tagEntry.getValue().keySet()) {
                built.addAttributes(tag, attribute);
            }
        }
        for (Map.Entry<String, List<String>> protocolEntry : protocolsByAttribute.entrySet()) {
            String qualified = protocolEntry.getKey();
            int dot = qualified.indexOf('.');
            if (dot <= 0 || dot == qualified.length() - 1) {
                throw new IllegalStateException(
                        "Protocol key must look like tag.attribute but was: " + qualified);
            }
            String tag = qualified.substring(0, dot);
            String attribute = qualified.substring(dot + 1);
            for (String protocol : protocolEntry.getValue()) {
                built.addProtocols(tag, attribute, protocol);
            }
        }

        whitelist = built;
        return built;
    }
}
/** Guards the one-time construction of {@link #regexMap}. */
private static final Object REGEX_MAP_LOCK = new Object();

/** Lazily built attribute-value rules; volatile so only a complete map is ever visible. */
private static volatile Map<String, String> regexMap = null;

/**
 * Returns the attribute-value rules described by {@code /data/whitelist.conf}, building
 * them on first use.
 * <p>
 * Every attribute in the {@code whiteList} object whose configured value is a non-blank
 * string contributes one entry: the key is the selector {@code tag[attribute]} and the
 * value is the configured expression, copied verbatim. The map is assembled in a local
 * variable and only published once it is complete.
 *
 * @return the shared rule map (possibly empty)
 * @throws IllegalStateException if the configuration cannot be read
 */
private static Map<String, String> initRegexMap() {
    Map<String, String> ready = regexMap;
    if (ready != null) {
        return ready;
    }
    synchronized (REGEX_MAP_LOCK) {
        if (regexMap != null) {
            return regexMap;
        }

        String jsonString;
        Resource resource = new ClassPathResource("/data/whitelist.conf");
        InputStream input = null;
        try {
            // getInputStream() also works when the resource is packaged inside a JAR,
            // where getFile() fails.
            input = resource.getInputStream();
            // Decoded explicitly so the result does not depend on the JVM default charset.
            // NOTE(review): assumes whitelist.conf is stored as UTF-8 - confirm.
            jsonString = IOUtils.toString(input, "UTF-8");
        } catch (IOException e) {
            throw new IllegalStateException("Cannot read /data/whitelist.conf from the classpath", e);
        } finally {
            IOUtils.closeQuietly(input);
        }

        JSONObject jsonObject = JSONObject.fromObject(jsonString);
        JSONObject whitelistjson = jsonObject.getJSONObject("whiteList");
        JsonMapper newMapper = new JsonMapper();
        @SuppressWarnings("unchecked")
        Map<String, Map<String, String>> attributesByTag =
                newMapper.fromJson(whitelistjson.toString(), HashMap.class);

        Map<String, String> built = new HashMap<String, String>();
        for (Map.Entry<String, Map<String, String>> tagEntry : attributesByTag.entrySet()) {
            String tag = tagEntry.getKey();
            for (Map.Entry<String, String> attributeEntry : tagEntry.getValue().entrySet()) {
                String expression = attributeEntry.getValue();
                // An empty value means "attribute allowed, value unrestricted".
                if (expression != null && expression.trim().length() > 0) {
                    built.put(tag + "[" + attributeEntry.getKey() + "]", expression);
                }
            }
        }

        regexMap = built;
        return built;
    }
}
/**
 * Escapes the four HTML-significant characters {@code <}, {@code >}, {@code "} and
 * {@code &} as {@code &lt;}, {@code &gt;}, {@code &quot;} and {@code &amp;}.
 * <p>
 * The input is scanned once; when it contains none of these characters the very same
 * string instance is returned and nothing is allocated.
 *
 * @param input the text to escape, may be {@code null}
 * @return the escaped text, or {@code input} itself when there is nothing to escape
 *         (including {@code null} and the empty string)
 */
public static String filter(String input) {
    if (input == null || input.length() == 0) {
        return input;
    }
    StringBuilder filtered = null; // created lazily on the first special character
    for (int i = 0; i < input.length(); i++) {
        char c = input.charAt(i);
        String entity;
        switch (c) {
        case '<':
            entity = "&lt;";
            break;
        case '>':
            entity = "&gt;";
            break;
        case '"':
            entity = "&quot;";
            break;
        case '&':
            entity = "&amp;";
            break;
        default:
            entity = null;
        }
        if (entity == null) {
            if (filtered != null) {
                filtered.append(c);
            }
        } else {
            if (filtered == null) {
                filtered = new StringBuilder(input.length() + 16);
                filtered.append(input, 0, i);
            }
            filtered.append(entity);
        }
    }
    return filtered == null ? input : filtered.toString();
}
/**
 * Tells whether the text contains at least one of the characters {@code >}, {@code <},
 * {@code "} or {@code &}.
 *
 * @param input the text to inspect, may be {@code null}
 * @return {@code true} if such a character occurs; {@code false} otherwise, including
 *         for {@code null} and the empty string
 */
public static boolean hasSpecialChars(String input) {
    if (input == null || input.isEmpty()) {
        return false;
    }
    for (char current : input.toCharArray()) {
        if (current == '>' || current == '<' || current == '"' || current == '&') {
            return true;
        }
    }
    return false;
}
/**
 * Removes every tag-like span, i.e. everything that starts with "&lt;" and ends with
 * "&gt;", from the text.
 *
 * @param str the text to strip
 * @return the text without any tag-like span
 */
public static String filterHtml(String str) {
    final Matcher tagMatcher = Pattern.compile(regxpForHtml).matcher(str);
    return tagMatcher.replaceAll("");
}
/**
 * Removes every tag-like span (everything from "&lt;" up to the next "&gt;") except
 * occurrences of one permitted tag.
 * <p>
 * Both the opening and the closing form of {@code tag} are kept, attributes included.
 * The tag name is matched literally and case-sensitively and must be followed by
 * whitespace, {@code /} or {@code >}, so asking for {@code "i"} keeps {@code <i>} but
 * still removes {@code <iframe>} and {@code <img>}. The input is processed in a single
 * pass without substituting placeholder text, so ordinary text can never be turned into
 * markup.
 * <p>
 * Attributes of the permitted tag are NOT inspected.
 *
 * @param str the HTML fragment to process, may be {@code null}
 * @param tag the tag name to keep; when {@code null} or empty, every tag is removed
 * @return the processed fragment, or {@code null} when {@code str} is {@code null}
 */
public static String filterHtmlExceptTag(String str, String tag) {
    if (str == null) {
        return null;
    }
    Pattern permitted = (tag == null || tag.length() == 0)
            ? null
            : Pattern.compile("</?" + Pattern.quote(tag) + "(?=[\\s/>])");
    Matcher anyTag = Pattern.compile("<([^>]*)>").matcher(str);
    StringBuffer out = new StringBuffer(str.length());
    while (anyTag.find()) {
        String whole = anyTag.group();
        boolean keep = permitted != null && permitted.matcher(whole).lookingAt();
        // quoteReplacement: the kept tag may contain '$' or '\', e.g. inside a URL.
        anyTag.appendReplacement(out, keep ? Matcher.quoteReplacement(whole) : "");
    }
    anyTag.appendTail(out);
    return out.toString();
}
/**
 * Removes the occurrences of one tag from the text.
 * <p>
 * Only occurrences in which the tag name is followed by whitespace are matched, so a
 * bare {@code <tag>} or a closing {@code </tag>} is left in place. The tag name is
 * spliced into the regular expression as given.
 *
 * @param str the text to process
 * @param tag the tag name to remove
 * @return the text without the matched tags
 */
public static String fiterHtmlTag(String str, String tag) {
    final String tagExpression = "<\\s*" + tag + "\\s+([^>]*)\\s*>";
    return Pattern.compile(tagExpression).matcher(str).replaceAll("");
}
/**
 * Replaces each occurrence of a tag by the value of one of its attributes, wrapped in a
 * start and an end marker.
 * <p>
 * Example: with {@code beforeTag = "img"}, {@code tagAttrib = "src"},
 * {@code startTag = "[img]"} and {@code endTag = "[/img]"}, the tag
 * {@code <img alt="x" src="u">} becomes {@code [img]u[/img]}. A tag that lacks the
 * attribute is removed. Only double-quoted attribute values are recognised, and only
 * tags whose name is followed by whitespace are matched.
 * <p>
 * The attribute value and both markers are inserted literally; characters such as
 * {@code $} or {@code \} (common in URLs) have no special meaning.
 *
 * @param str the text to process
 * @param beforeTag the name of the tag to replace
 * @param tagAttrib the name of the attribute whose value is kept
 * @param startTag the marker placed in front of the value
 * @param endTag the marker placed behind the value
 * @return the text with every matched tag replaced
 */
public static String replaceHtmlTag(String str, String beforeTag,
        String tagAttrib, String startTag, String endTag) {
    Pattern patternForTag = Pattern.compile("<\\s*" + beforeTag + "\\s+([^>]*)\\s*>");
    Pattern patternForAttrib = Pattern.compile(tagAttrib + "=\"([^\"]+)\"");
    Matcher matcherForTag = patternForTag.matcher(str);
    StringBuffer sb = new StringBuffer();
    while (matcherForTag.find()) {
        Matcher matcherForAttrib = patternForAttrib.matcher(matcherForTag.group(1));
        // Only the wrapped value replaces the tag; attribute text that precedes the
        // wanted attribute must not leak into the output.
        String replacement = matcherForAttrib.find()
                ? startTag + matcherForAttrib.group(1) + endTag
                : "";
        matcherForTag.appendReplacement(sb, Matcher.quoteReplacement(replacement));
    }
    matcherForTag.appendTail(sb);
    return sb.toString();
}
/**
 * Replaces each occurrence of a tag by a fixed piece of text.
 * <p>
 * Only occurrences in which the tag name is followed by whitespace are matched, so a
 * bare {@code <tag>} or a closing {@code </tag>} is left in place. The replacement text
 * is inserted literally; {@code $} and {@code \} have no special meaning.
 *
 * @param str the text to process
 * @param tag the name of the tag to replace
 * @param text the text that takes the place of every matched tag
 * @return the text with every matched tag replaced
 */
public static String replaceHtmlTagOfText(String str, String tag, String text) {
    String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>";
    Matcher matcher = Pattern.compile(regxp).matcher(str);
    // Quote once up front: appendReplacement would otherwise treat "$n" as a group
    // reference and fail on text such as "$5".
    String literal = Matcher.quoteReplacement(text);
    StringBuffer sb = new StringBuffer();
    while (matcher.find()) {
        matcher.appendReplacement(sb, literal);
    }
    matcher.appendTail(sb);
    return sb.toString();
}
/**
 * Collects the values of one attribute from every occurrence of one tag.
 * <p>
 * Values may be double-quoted, single-quoted or unquoted; the surrounding quotes are not
 * part of the result and quoted values may contain spaces. The attribute may appear at
 * any position in the tag, including last. Tag and attribute names are matched literally
 * and case-sensitively.
 *
 * @param source the text to search, may be {@code null}
 * @param element the tag name, must not be {@code null}
 * @param attr the attribute name, must not be {@code null}
 * @return the attribute values in document order; empty when nothing matches or
 *         {@code source} is {@code null}
 */
public static List<String> match(String source, String element, String attr) {
    List<String> result = new ArrayList<String>();
    if (source == null) {
        return result;
    }
    String reg = "<" + Pattern.quote(element) + "(?=\\s)[^<>]*?\\s" + Pattern.quote(attr)
            + "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'<>]+))";
    Matcher m = Pattern.compile(reg).matcher(source);
    while (m.find()) {
        // Exactly one of the three alternatives (double-quoted, single-quoted, bare)
        // participated in the match.
        String value = m.group(1);
        if (value == null) {
            value = m.group(2);
        }
        if (value == null) {
            value = m.group(3);
        }
        result.add(value);
    }
    return result;
}
/**
 * Collects the {@code <img ...>} tags found in the given HTML.
 * <p>
 * Matching is case-sensitive and requires a space right after "&lt;img". Each result is
 * the matched text up to, but not including, the next "&gt;", with a "&gt;" appended.
 *
 * @param html the HTML to scan
 * @return the image tags in document order; empty when there are none
 */
public static List<String> getImgHTML(String html) {
    final List<String> tags = new ArrayList<String>();
    final Matcher tagScanner = Pattern.compile("<img ([^>]*)").matcher(html);
    for (boolean found = tagScanner.find(); found; found = tagScanner.find()) {
        // The whole match already starts with "<img "; only the terminator is missing.
        tags.add(tagScanner.group() + ">");
    }
    return tags;
}
/**
 * Collects the {@code src} attribute value of every {@code <img>} tag.
 * <p>
 * Tag and attribute names are matched case-insensitively. Values may be double-quoted,
 * single-quoted or unquoted; surrounding quotes are not part of the result. Each tag is
 * examined on its own, so one tag contributes at most one value and attributes such as
 * {@code data-src} are not mistaken for {@code src}.
 *
 * @param htmlStr the HTML to scan, may be {@code null}
 * @return the image addresses in document order; empty when there are none or
 *         {@code htmlStr} is {@code null}
 */
public static List<String> getImgSrc(String htmlStr) {
    List<String> pics = new ArrayList<String>();
    if (htmlStr == null) {
        return pics;
    }
    Pattern srcAttribute = Pattern.compile(
            "\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))", Pattern.CASE_INSENSITIVE);
    Matcher imgTag = Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE).matcher(htmlStr);
    while (imgTag.find()) {
        Matcher src = srcAttribute.matcher(imgTag.group());
        if (src.find()) {
            // Exactly one of the three alternatives (double-quoted, single-quoted, bare)
            // participated in the match.
            String value = src.group(1);
            if (value == null) {
                value = src.group(2);
            }
            if (value == null) {
                value = src.group(3);
            }
            pics.add(value);
        }
    }
    return pics;
}
/**
 * Collects the {@code alt} attribute value of every {@code <img>} tag that has one.
 * <p>
 * Tag and attribute names are matched case-insensitively. Values may be double-quoted,
 * single-quoted or unquoted; surrounding quotes are not part of the result and quoted
 * values may contain spaces. Each tag is examined on its own, so one tag contributes at
 * most one value.
 *
 * @param htmlStr the HTML to scan, may be {@code null}
 * @return the alternative texts in document order; empty when there are none or
 *         {@code htmlStr} is {@code null}
 */
public static List<String> getImgAlt(String htmlStr) {
    List<String> alts = new ArrayList<String>();
    if (htmlStr == null) {
        return alts;
    }
    Pattern altAttribute = Pattern.compile(
            "\\salt\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))", Pattern.CASE_INSENSITIVE);
    Matcher imgTag = Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE).matcher(htmlStr);
    while (imgTag.find()) {
        Matcher alt = altAttribute.matcher(imgTag.group());
        if (alt.find()) {
            // Exactly one of the three alternatives (double-quoted, single-quoted, bare)
            // participated in the match.
            String value = alt.group(1);
            if (value == null) {
                value = alt.group(2);
            }
            if (value == null) {
                value = alt.group(3);
            }
            alts.add(value);
        }
    }
    return alts;
}
/**
 * Replaces every tag-like span, i.e. everything that starts with "&lt;" and ends with
 * "&gt;", by a single space.
 *
 * @param str the text to process
 * @return the text with each tag-like span turned into one space
 */
public static String filterHtmlWithSapce(String str) {
    final Matcher tagMatcher = Pattern.compile(regxpForHtml).matcher(str);
    return tagMatcher.replaceAll(" ");
}
}
{
"whiteList":{
"a":{"href":"","title":""},
"b":{},
"blockquote":{"cite":""},
"br":{},
"caption":{},
"cite":{},
"code":{},
"col":{"span":"","width":""},
"colgroup":{"span":"","width":""},
"dd":{},
"div":{},
"dl":{},
"dt":{},
"em":{},
"h1":{},
"h2":{},
"h3":{},
"h4":{},
"h5":{},
"h6":{},
"i":{},
"img":{"align":"", "alt":"", "height":"", "src":"", "title":"", "width":""},
"li":{"class":"","style":"/^text-align:\\s*(left|right|center);?\\s*$/i"},
"ol":{"start":"", "type":""},
"p":{"style":"/^text-align:\\s*(left|right|center);?\\s*$/i"},
"pre":{},
"q":{"cite":""},
"small":{},
"span":{"style":"/^\\s*font-family\\s*:\\s*(('|\\\"|"|')?(楷體|楷體_GB2312|宋體|微軟雅黑|黑體|,|\\s|\\w|sans-serif)('|\\\"|"|')?)+;?\\s*|\\s*(color|font-size|background-color)\\s*:\\s*(#\\w*|[\\w\\s]*|rgb\\s*\\(\\s*\\d+\\s*,\\s*\\d+\\s*,\\s*\\d+\\s*\\));?\\s*|\\s*text-decoration\\s*:\\s*(underline|overline|line-through|blink)\\s*;?\\s*$/i"},
"strike":{},
"strong":{},
"sub":{},
"sup":{},
"table":{"summary":"", "width":""},
"tbody":{},
"td":{"abbr":"", "axis":"", "colspan":"", "rowspan":"", "width":""},
"tfoot":{},
"th":{"abbr":"", "axis":"", "colspan":"", "rowspan":"", "scope":"","width":""},
"thead":{},
"tr":{},
"u":{},
"ul":{"type":"","class":"","style":"/^list-style-type:\\s*(decimal|disc);\\s*$/i"}
},
"protocols":{
"a.href":["ftp", "http", "https", "mailto"],
"blockquote.cite":["http", "https"],
"cite.cite":["http", "https"],
"img.src":["http", "https"],
"q.cite":["http", "https"]
}
}
即每個標籤的任何屬性,屬性的值我們都可以進行過濾和定製。
這樣,用戶輸入的任何東西都可以得到我們的控制。