在網絡上可以看到很多實現中文字符串截取功能的函數,但其算法多是逐字循環判斷,當字符串較大時效率極低。
爲此這裏介紹兩個高效的函數:c_substr、m_substr。他們的用法完全與substr和mb_substr相同。不同之處在於:c_substr按字節計算,即一個漢字的長度爲2;m_substr按字計算,即一個漢字的長度爲1。可根據需要選用。
代碼:
/**
 * Byte-counted substring for double-byte text (lead byte 0x81-0xFE, trail
 * byte 0x40-0xFE, e.g. GBK) that never returns half of a character.
 *
 * Called like substr(): c_substr($str, $start [, $length]). Offsets and
 * lengths are in bytes, so one double-byte character counts as 2.
 *
 * - If $start falls in the middle of a double-byte character, the slice
 *   begins after that character.
 * - If the slice would end in the middle of a double-byte character, it is
 *   extended by one byte so the character is kept whole.
 *
 * @param string $str    Input string in a double-byte encoding.
 * @param int    $start  Byte offset; negative values count from the end of $str.
 * @param int    $length Optional third argument (read via func_get_arg):
 *                       number of bytes to take. Defaults to strlen($str).
 * @return string|false  Whatever substr() returns for the adjusted range.
 */
function c_substr($str,$start=0) {
    // Single-quoted so the \x escapes reach PCRE intact and '/' is only the
    // delimiter. Removing every complete double-byte character and every
    // single-byte character leaves at most a dangling lead byte behind.
    $strip = array('/[\x81-\xfe][\x40-\xfe]/', '/[\x00-\x7f]/');

    $total = strlen($str);
    $length = func_num_args() > 2 ? func_get_arg(2) : $total;

    // A negative offset is relative to the end of the string, not to the
    // requested length.
    if ($start < 0) {
        $start = max(0, $start + $total);
    }

    if ($start > 0) {
        $head = (string) substr($str, 0, $start);
        // Only a high last byte can be a dangling lead byte.
        if ($head !== '' && ord($head[strlen($head) - 1]) > 127) {
            $start += strlen(preg_replace($strip, '', $head));
        }
    }

    $piece = (string) substr($str, $start, $length);
    $length = strlen($piece);
    if ($length > 0 && ord($piece[$length - 1]) > 127) {
        // Grow by the dangling byte count so the last character is complete.
        $length += strlen(preg_replace($strip, '', $piece));
    }

    return substr($str, $start, $length);
}
/**
 * Character-counted substring for double-byte text: a byte in 0x80-0xFF
 * together with the byte that follows it counts as one character, and any
 * other byte counts as one character on its own.
 *
 * Called like mb_substr(): m_substr($str, $start [, $length]).
 *
 * @param string $str    Input string in a double-byte encoding.
 * @param int    $start  Character offset (array_slice semantics, so negative
 *                       values count from the end).
 * @param int    $length Optional third argument (read via func_get_arg):
 *                       number of characters to take.
 * @return string The selected characters joined back together.
 */
function m_substr($str,$start) {
    // Single-quoted so PCRE sees the \x escapes; the "s" modifier lets "."
    // match newlines so they are not silently dropped from the result.
    preg_match_all('/[\x80-\xff]?./s', $str, $ar);
    $chars = $ar[0];

    if (func_num_args() >= 3) {
        return join("", array_slice($chars, $start, func_get_arg(2)));
    }
    return join("", array_slice($chars, $start));
}
// NOTE(review): this span repeats the body of c_substr() without a
// "function" header above it, so the closing brace at the end is unmatched.
// It looks like a copy/paste duplicate - confirm and remove.
// NOTE(review): the patterns below contain "/x81" where a regex byte escape
// would be written "\x81"; presumably the backslashes were lost - verify.
$ch = chr(127);
$p = array("/[/x81-/xfe]([/x81-/xfe]|[/x40-/xfe])/","/[/x01-/x77]/");
$r = array("","");
// The optional third argument is later passed to substr() as the length;
// without it the full byte length of $str is used.
if(func_num_args() > 2)
$end = func_get_arg(2);
else
$end = strlen($str);
if($start < 0)
$start += $end;
// If the leading slice ends on a byte above chr(127), strip the pattern
// matches from it and advance $start by the number of bytes left over.
if($start > 0) {
$s = substr($str,0,$start);
if($s[strlen($s)-1] > $ch) {
$s = preg_replace($p,$r,$s);
$start += strlen($s);
}
}
// Same adjustment for the tail: grow $end by the bytes left after stripping.
$s = substr($str,$start,$end);
$end = strlen($s);
if($s[$end-1] > $ch) {
$s = preg_replace($p,$r,$s);
$end += strlen($s);
}
return substr($str,$start,$end);
}
// NOTE(review): an identical m_substr() definition appears earlier in this
// file; PHP cannot declare the same function twice - confirm which copy
// should stay.
/**
 * Character-counted substring for double-byte text: a byte in 0x80-0xFF
 * together with the byte that follows it counts as one character, and any
 * other byte counts as one character on its own.
 *
 * Called like mb_substr(): m_substr($str, $start [, $length]).
 *
 * @param string $str    Input string in a double-byte encoding.
 * @param int    $start  Character offset (array_slice semantics, so negative
 *                       values count from the end).
 * @param int    $length Optional third argument (read via func_get_arg):
 *                       number of characters to take.
 * @return string The selected characters joined back together.
 */
function m_substr($str,$start) {
    // Single-quoted so PCRE sees the \x escapes; the "s" modifier lets "."
    // match newlines so they are not silently dropped from the result.
    preg_match_all('/[\x80-\xff]?./s', $str, $ar);
    $chars = $ar[0];

    if (func_num_args() >= 3) {
        return join("", array_slice($chars, $start, func_get_arg(2)));
    }
    return join("", array_slice($chars, $start));
}
Windix Feng 也改寫了一個適用於 UTF-8 的版本(utf8_substr2),在此一併記錄,留待以後需要時參考。
/**
 * UTF-8 version of substr() for environments without mb_substr().
 * Offsets and lengths count UTF-8 characters, not bytes.
 *
 * Called as utf8_substr2($str, $start [, $length]). Bytes that do not form
 * one of the sequences in the pattern below are skipped.
 *
 * Author: Windix Feng
 * Bug report to: windix(AT)263.net, http://www.douzi.org/blog
 * - History -
 * 1.0 2004-02-01 Initial Version
 * 2.0 2004-02-01 Use PREG instead of STRCMP and cycles, SPEED UP!
 *
 * @param string $str    UTF-8 encoded input.
 * @param int    $start  Character offset (array_slice semantics, so negative
 *                       values count from the end).
 * @param int    $length Optional third argument (read via func_get_arg):
 *                       number of characters to take.
 * @return string The selected characters joined back together.
 */
function utf8_substr2($str,$start) {
    // Single-quoted so PCRE, not PHP, interprets the \x escapes and '/' is
    // only the delimiter. One alternative per sequence length (1 to 4 bytes);
    // NUL is included so it is not dropped from the result.
    $pattern = '/[\x00-\x7f]'
             . '|[\xc2-\xdf][\x80-\xbf]'
             . '|\xe0[\xa0-\xbf][\x80-\xbf]'
             . '|[\xe1-\xef][\x80-\xbf][\x80-\xbf]'
             . '|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]'
             . '|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]/';
    preg_match_all($pattern, $str, $ar);
    $chars = $ar[0];

    if (func_num_args() >= 3) {
        return join("", array_slice($chars, $start, func_get_arg(2)));
    }
    return join("", array_slice($chars, $start));
}
// NOTE(review): this span repeats the body of utf8_substr2() without a
// "function" header above it, so the closing brace at the end is unmatched.
// It looks like a copy/paste duplicate - confirm and remove.
/*
UTF-8 version of substr(), for people who can't use mb_substr() like me.
Length is not the count of Bytes, but the count of UTF-8 Characters
Author: Windix Feng
Bug report to: windix(AT)263.net, http://www.douzi.org/blog
- History -
1.0 2004-02-01 Initial Version
2.0 2004-02-01 Use PREG instead of STRCMP and cycles, SPEED UP!
*/
// NOTE(review): the pattern contains "/x01" where a regex byte escape would
// be written "\x01"; presumably the backslashes were lost - verify.
preg_match_all("/[/x01-/x7f]|[/xc2-/xdf][/x80-/xbf]|/xe0[/xa0-/xbf][/x80-/xbf]|[/xe1-/xef][/x80-/xbf][/x80-/xbf]|/xf0[/x90-/xbf][/x80-/xbf][/x80-/xbf]|[/xf1-/xf7][/x80-/xbf][/x80-/xbf][/x80-/xbf]/", $str, $ar);
// With a third argument, it is passed to array_slice() as the length;
// otherwise everything from $start onward is joined and returned.
if(func_num_args() >= 3) {
$end = func_get_arg(2);
return join("",array_slice($ar[0],$start,$end));
} else {
return join("",array_slice($ar[0],$start));
}
}