URL地址相對路徑轉絕對路徑

原創

2020-02-21 01:20

URL相對路徑轉絕對路徑

在做爬蟲或網頁解析時，經常會從網頁中提取到許多相對 URL。在進一步爬取之前，需要先將這些相對地址轉換成完整的絕對 URL。URL 與文件路徑的結構相似，轉換看似簡單，實則有很多情況需要考慮（例如 `./`、`../`，或只含查詢字串、片段的引用），網上提供的很多方法都會有各種各樣的問題。這裏提供了一個相對靠譜的方法，代碼如下。

// Usage example: resolve the relative reference "foo" against the base URL.
// Prints: https://www.google.com/foo
print PhpUri::doIt("https://www.google.com/", "foo");

/**
 * Resolves a relative URL reference against a base URL.
 *
 * A URL is split into its five generic components (scheme, authority, path,
 * query, fragment) with the regular expression from RFC 3986 appendix B, and
 * a relative reference is merged onto a base following the general shape of
 * RFC 3986 section 5.2. An absent component is stored as the empty string.
 *
 * Typical use:
 *     PhpUri::doIt("https://www.google.com/", "foo"); // https://www.google.com/foo
 */
class PhpUri
{
    /** @var string Scheme without the trailing ":" ('' when absent). */
    public $scheme;
    /** @var string Authority (host, optional userinfo/port) without the leading "//". */
    public $authority;
    /** @var string Path component ('' when absent). */
    public $path;
    /** @var string Query without the leading "?" ('' when absent). */
    public $query;
    /** @var string Fragment without the leading "#" ('' when absent). */
    public $fragment;

    /**
     * Convenience entry point: resolve $relativeUrl against $baseUrl.
     *
     * @param string $baseUrl     Base URL the reference is relative to.
     * @param string $relativeUrl Relative (or already absolute) reference.
     * @return string The resolved URL.
     */
    public static function doIt($baseUrl, $relativeUrl)
    {
        return self::parse($baseUrl)->join($relativeUrl);
    }

    /**
     * Splits $url into its components.
     *
     * @param string $url URL or relative reference to split.
     */
    public function __construct($url)
    {
        // RFC 3986 appendix B pattern. The "s" modifier lets "." match
        // newlines so the pattern matches every input string; without it a
        // newline inside the fragment made the whole match fail.
        preg_match(
            '/^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/s',
            (string) $url,
            $matches
        );

        // preg_match() omits trailing groups that did not participate in the
        // match, so every component falls back to the empty string.
        $this->scheme    = isset($matches[2]) ? $matches[2] : '';
        $this->authority = isset($matches[4]) ? $matches[4] : '';
        $this->path      = isset($matches[5]) ? $matches[5] : '';
        $this->query     = isset($matches[7]) ? $matches[7] : '';
        $this->fragment  = isset($matches[9]) ? $matches[9] : '';
    }

    /**
     * Parses a base URL; an absent path is replaced by "/".
     *
     * @param string $url Base URL.
     * @return PhpUri
     */
    public static function parse($url)
    {
        $uri = new self($url);
        if (self::isBlank($uri->path))
        {
            $uri->path = '/';
        }

        return $uri;
    }

    /**
     * Resolves $relative against this URI.
     *
     * - A reference with its own scheme or authority keeps its own path and query.
     * - A reference with an empty path inherits this URI's path, and this
     *   URI's query too when it has none of its own.
     * - A reference whose path starts with "/" keeps that path.
     * - Any other path is appended to this URI's path up to its last "/".
     *
     * @param string $relative Relative (or absolute) reference.
     * @return string The resolved URL.
     */
    public function join($relative)
    {
        $uri = new self($relative);

        if (self::isBlank($uri->scheme) && self::isBlank($uri->authority))
        {
            if (self::isBlank($uri->path))
            {
                $uri->path = $this->path;
                if (self::isBlank($uri->query))
                {
                    $uri->query = $this->query;
                }
            }
            elseif (strpos($uri->path, '/') !== 0)
            {
                $uri->path = $this->baseDirectory() . $uri->path;
            }
        }

        if (self::isBlank($uri->scheme))
        {
            $uri->scheme = $this->scheme;
            if (self::isBlank($uri->authority))
            {
                $uri->authority = $this->authority;
            }
        }

        return $uri->toStr();
    }

    /**
     * Tells whether a component is absent.
     *
     * empty() must not be used for this: it also reports the string "0" as
     * empty, which is a legitimate path segment, query or fragment.
     *
     * @param mixed $component Component value.
     * @return bool True for null or the empty string.
     */
    private static function isBlank($component)
    {
        return $component === null || $component === '';
    }

    /**
     * Returns this URI's path up to and including its last "/".
     *
     * Falls back to "/" when the path contains no slash, so a base without a
     * path (e.g. "https://host") still yields "https://host/foo" rather than
     * gluing the reference onto the authority.
     *
     * @return string Directory part of the base path, always ending in "/".
     */
    private function baseDirectory()
    {
        $basePath  = (string) $this->path;
        $lastSlash = strrpos($basePath, '/');
        if ($lastSlash === false)
        {
            return '/';
        }

        return substr($basePath, 0, $lastSlash + 1);
    }

    /**
     * Reassembles the components into a URL string, normalizing the path.
     *
     * @return string
     */
    private function toStr()
    {
        $ret = '';
        if (!self::isBlank($this->scheme))
        {
            $ret .= "{$this->scheme}:";
        }
        if (!self::isBlank($this->authority))
        {
            $ret .= "//{$this->authority}";
        }
        $ret .= $this->normalizePath($this->path);
        if (!self::isBlank($this->query))
        {
            $ret .= "?{$this->query}";
        }
        if (!self::isBlank($this->fragment))
        {
            $ret .= "#{$this->fragment}";
        }

        return $ret;
    }

    /**
     * Collapses repeated slashes and resolves "." and ".." segments.
     *
     * The rewrite rules are applied repeatedly until a full pass changes
     * nothing.
     *
     * @param string $path Path to normalize.
     * @return string Normalized path ('' when $path is absent).
     */
    private function normalizePath($path)
    {
        if (self::isBlank($path))
        {
            return '';
        }

        // Each rule is [pattern, replacement limit]; every match becomes "/".
        $rules = [
            // Runs of slashes collapse to one.
            ['`//+`', -1],
            // A leading "/./" or "/../" cannot climb above the root; "(/|$)"
            // also covers a path that is exactly "/.." or "/.".
            ['`^/\\.\\.?(/|$)`', -1],
            // "/./" and a trailing "/." are no-ops.
            ['`/\\.(/|$)`', -1],
            // "/segment/.." removes the segment. The limit is 1 (changed from
            // -1 by Dominik Habichtsberg, 24 May 2015) because replacing every
            // match in one pass broke climbing up several directory levels.
            ['`/[^/]*?/\\.\\.(/|$)`', 1],
        ];

        $normalized = (string) $path;
        do
        {
            $replacements = 0;
            foreach ($rules as $rule)
            {
                $result = preg_replace($rule[0], '/', $normalized, $rule[1], $count);
                if ($result === null)
                {
                    // PCRE failure: keep what has been normalized so far.
                    return $normalized;
                }
                $normalized = $result;
                $replacements += $count;
            }
        }
        while ($replacements > 0);

        return $normalized;
    }
}