URL相對路徑轉絕對路徑
在做爬蟲或網頁解析時,經常會從網頁中提取到許多相對 URL。在進一步爬取之前,需要先將這些相對地址轉換成完整的絕對 URL。URL 與文件路徑相似,轉換看似簡單,實際上有很多情況需要考慮,網上流傳的許多方法都存在各種各樣的問題。這裏提供一個相對可靠的實現,代碼如下。
// Usage example: resolve the relative reference "foo" against the base URL.
// Prints: https://www.google.com/foo
print PhpUri::doIt("https://www.google.com/", "foo");
/**
 * Splits a URL into its five generic components and resolves relative
 * references against a base URL.
 *
 * Typical use: PhpUri::doIt($baseUrl, $relativeUrl).
 *
 * Note: when the result is serialised, runs of consecutive slashes in the
 * path are collapsed to a single slash and "." / ".." segments are removed.
 */
class PhpUri
{
    /** Scheme without the trailing ":" ('' when absent). */
    public $scheme;

    /** Authority without the leading "//" ('' when absent). */
    public $authority;

    /** Path component ('' when absent). */
    public $path;

    /** Query without the leading "?" ('' when absent). */
    public $query;

    /** Fragment without the leading "#" ('' when absent). */
    public $fragment;

    /**
     * Call this method to perform the conversion.
     *
     * @param string $baseUrl     URL the relative reference is resolved against
     * @param string $relativeUrl relative (or already absolute) reference
     * @return string the resolved URL
     */
    public static function doIt($baseUrl, $relativeUrl)
    {
        return self::parse($baseUrl)->join($relativeUrl);
    }

    /**
     * PhpUri constructor: splits $url into scheme, authority, path, query
     * and fragment.
     *
     * Components that are missing (or all of them, if the pattern does not
     * match at all) are stored as empty strings.
     *
     * @param string $url
     */
    public function __construct($url)
    {
        $matches = [];
        preg_match(
            '/^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/',
            $url,
            $matches
        );

        // preg_match() omits trailing groups that did not participate in the
        // match, so every group is read through a guard.
        $this->scheme    = isset($matches[2]) ? $matches[2] : '';
        $this->authority = isset($matches[4]) ? $matches[4] : '';
        $this->path      = isset($matches[5]) ? $matches[5] : '';
        $this->query     = isset($matches[7]) ? $matches[7] : '';
        $this->fragment  = isset($matches[9]) ? $matches[9] : '';
    }

    /**
     * Parses $url like the constructor, but substitutes "/" for an empty path.
     *
     * @param string $url
     * @return PhpUri
     */
    public static function parse($url)
    {
        $uri = new self($url);
        if (self::isBlank($uri->path)) {
            $uri->path = '/';
        }

        return $uri;
    }

    /**
     * Resolves $relative against this URI and returns the result as a string.
     *
     * - A reference with its own scheme or authority keeps its path and query.
     * - A reference with an empty path inherits this URI's path, and also its
     *   query when the reference has none.
     * - A path starting with "/" is used as-is.
     * - Any other path is appended to this URI's path up to its last "/".
     *
     * The fragment always comes from $relative.
     *
     * @param string $relative
     * @return string
     */
    public function join($relative)
    {
        $uri = new self($relative);

        $hasOwnLocation = !self::isBlank($uri->scheme) || !self::isBlank($uri->authority);
        if (!$hasOwnLocation) {
            if (self::isBlank($uri->path)) {
                $uri->path = $this->path;
                if (self::isBlank($uri->query)) {
                    $uri->query = $this->query;
                }
            } elseif (strpos($uri->path, '/') !== 0) {
                $uri->path = $this->baseDirectory() . $uri->path;
            }
        }

        if (self::isBlank($uri->scheme)) {
            $uri->scheme = $this->scheme;
            if (self::isBlank($uri->authority)) {
                $uri->authority = $this->authority;
            }
        }

        return $uri->toStr();
    }

    /**
     * Returns this URI's path with its last segment removed (always ending in
     * "/"), or "/" when the path contains no slash at all. The latter keeps a
     * merged path from being glued directly onto the authority.
     *
     * @return string
     */
    private function baseDirectory()
    {
        $basePath = (string) $this->path;
        if (strpos($basePath, '/') === false) {
            return '/';
        }

        return preg_replace('/\/[^\/]+$/', '/', $basePath);
    }

    /**
     * Tells whether a URI component is absent.
     *
     * Same as empty(), except that the string "0" counts as present: it is a
     * legitimate path segment, query, fragment or host.
     *
     * @param mixed $value
     * @return bool
     */
    private static function isBlank($value)
    {
        return empty($value) && $value !== '0';
    }

    /**
     * Serialises the components back into a URL string, normalising the path.
     *
     * @return string
     */
    private function toStr()
    {
        $ret = '';
        if (!self::isBlank($this->scheme)) {
            $ret .= $this->scheme . ':';
        }
        if (!self::isBlank($this->authority)) {
            $ret .= '//' . $this->authority;
        }
        $ret .= $this->normalizePath($this->path);
        if (!self::isBlank($this->query)) {
            $ret .= '?' . $this->query;
        }
        if (!self::isBlank($this->fragment)) {
            $ret .= '#' . $this->fragment;
        }

        return $ret;
    }

    /**
     * Collapses repeated slashes and removes "." and ".." segments.
     *
     * The rewrite rules are applied repeatedly until none of them matches.
     *
     * @param string $path
     * @return string normalised path ('' for an absent path)
     */
    private function normalizePath($path)
    {
        if (self::isBlank($path)) {
            return '';
        }

        $normalized = $path;
        do {
            // "//", "///", ... -> "/"
            $normalized = preg_replace('`//+`', '/', $normalized, -1, $slashes);
            // Leading "/./" or "/../", and a path that is exactly "/." or
            // "/..": there is nothing above the root to climb to.
            $normalized = preg_replace('`^/\\.\\.?(/|$)`', '/', $normalized, -1, $rootDots);
            // "/./" anywhere, or a trailing "/."
            $normalized = preg_replace('`/\\.(/|$)`', '/', $normalized, -1, $currentDirs);
            // "/segment/../" or a trailing "/segment/..". Limited to one
            // replacement per pass: an earlier change by Dominik Habichtsberg
            // (24 May 2015) switched the limit from -1 to 1 because climbing
            // up the directory tree failed otherwise.
            $normalized = preg_replace('`/[^/]*?/\\.\\.(/|$)`', '/', $normalized, 1, $parentDirs);
        } while ($slashes + $rootDots + $currentDirs + $parentDirs > 0);

        return $normalized;
    }
}