URL地址相对路径转绝对路径

URL相对路径转绝对路径

在做爬虫或者网页解析时,经常会从网页中提取到很多相对URL。在做进一步爬取之前,我们需要将这些地址转成完整的URL。URL和文件路径是相似的,看似很简单,实则有很多情况需要考虑;网上提供的很多方法都会有各种各样的问题,这里提供了一个相对靠谱的方法,代码如下。

// Usage example: resolve the relative reference "foo" against the base URL
// "https://www.google.com/" and print the resulting absolute URL.
echo PhpUri::doIt("https://www.google.com/", "foo");
// Prints: https://www.google.com/foo
/**
 * Converts relative URL references into absolute URLs.
 *
 * A URL is split into its five generic components (scheme, authority, path,
 * query, fragment) using the regular expression given in RFC 3986 Appendix B.
 * A component that is not present in the URL is stored as the empty string.
 *
 * Typical use:
 *
 *     PhpUri::doIt('https://www.google.com/', 'foo'); // https://www.google.com/foo
 *
 * Note: components are compared against '' explicitly instead of using
 * empty(), because empty('0') is true and would make a path, query, fragment
 * or host consisting of the single character "0" disappear.
 */
class PhpUri
{
    /** @var string Scheme without the trailing ":" ('' when absent). */
    public $scheme;

    /** @var string Authority (user-info, host, port) without the leading "//" ('' when absent). */
    public $authority;

    /** @var string Path component ('' when absent). */
    public $path;

    /** @var string Query without the leading "?" ('' when absent). */
    public $query;

    /** @var string Fragment without the leading "#" ('' when absent). */
    public $fragment;

    /**
     * Resolves a relative URL against a base URL. This is the main entry point.
     *
     * @param string $baseUrl     Absolute URL the reference is relative to.
     * @param string $relativeUrl Reference to resolve (may itself be absolute).
     * @return string The resolved URL with dot segments removed from its path.
     */
    public static function doIt($baseUrl, $relativeUrl)
    {
        return self::parse($baseUrl)->join($relativeUrl);
    }

    /**
     * Splits $url into scheme, authority, path, query and fragment.
     *
     * @param string $url URL or relative reference to split.
     */
    public function __construct($url)
    {
        $matches = array();
        // The "s" modifier lets "." match newlines, so the pattern matches any
        // input string and the components below are always well defined.
        preg_match(
            '/^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/s',
            (string) $url,
            $matches
        );

        // preg_match omits trailing groups that did not participate in the
        // match, so every component falls back to '' explicitly.
        $this->scheme    = isset($matches[2]) ? $matches[2] : '';
        $this->authority = isset($matches[4]) ? $matches[4] : '';
        $this->path      = isset($matches[5]) ? $matches[5] : '';
        $this->query     = isset($matches[7]) ? $matches[7] : '';
        $this->fragment  = isset($matches[9]) ? $matches[9] : '';
    }

    /**
     * Parses a base URL; an absent path is replaced by "/".
     *
     * @param string $url Base URL.
     * @return PhpUri
     */
    public static function parse($url)
    {
        $uri = new PhpUri($url);
        if ($uri->path === '')
        {
            $uri->path = '/';
        }

        return $uri;
    }

    /**
     * Resolves $relative against this URI, which acts as the base.
     *
     * @param string $relative Reference to resolve.
     * @return string The resolved URL.
     */
    public function join($relative)
    {
        $uri = new PhpUri($relative);

        if ($uri->scheme !== '' || $uri->authority !== '')
        {
            // The reference carries its own scheme or authority: its path and
            // query are used as they are.
        }
        elseif ($uri->path === '')
        {
            // Query-only / fragment-only / empty reference: keep the base path,
            // and keep the base query unless the reference brings its own.
            $uri->path = $this->path;
            if ($uri->query === '')
            {
                $uri->query = $this->query;
            }
        }
        elseif (strpos($uri->path, '/') !== 0)
        {
            // Relative path: append it to the base path's directory, i.e. the
            // base path with everything after its last "/" removed.
            $base_path = $this->path;
            if (strpos($base_path, '/') === false)
            {
                $base_path = '';
            }
            else
            {
                $base_path = preg_replace('/\/[^\/]+$/', '/', $base_path);
            }
            // An empty base directory always becomes "/", so that a base built
            // without a path (e.g. new PhpUri('http://host')) yields
            // "http://host/foo" rather than "http://hostfoo".
            if ($base_path === '')
            {
                $base_path = '/';
            }
            $uri->path = $base_path . $uri->path;
        }
        // Remaining case: the path starts with "/" and is used unchanged.

        if ($uri->scheme === '')
        {
            $uri->scheme = $this->scheme;
            if ($uri->authority === '')
            {
                $uri->authority = $this->authority;
            }
        }

        return $uri->toStr();
    }

    /**
     * Reassembles the components into a URL string, normalizing the path.
     *
     * @return string
     */
    private function toStr()
    {
        $ret = '';
        if ($this->scheme !== '')
        {
            $ret .= "{$this->scheme}:";
        }
        if ($this->authority !== '')
        {
            $ret .= "//{$this->authority}";
        }
        $ret .= $this->normalizePath($this->path);
        if ($this->query !== '')
        {
            $ret .= "?{$this->query}";
        }
        if ($this->fragment !== '')
        {
            $ret .= "#{$this->fragment}";
        }

        return $ret;
    }

    /**
     * Collapses repeated slashes and removes "." and ".." segments.
     *
     * The four rewrite rules are applied repeatedly until none of them changes
     * the path any more.
     *
     * @param string $path Path to normalize.
     * @return string Normalized path ('' for an empty path).
     */
    private function normalizePath($path)
    {
        if ($path === '')
        {
            return '';
        }

        do
        {
            // Collapse runs of slashes into a single slash.
            $path = preg_replace('`//+`', '/', $path, -1, $c0);
            // Drop a leading "/./" or "/../" (nothing to climb above the root);
            // also matches at end of string so that "/.." becomes "/".
            $path = preg_replace('`^/\\.\\.?(/|$)`', '/', $path, -1, $c1);
            // Drop "." segments.
            $path = preg_replace('`/\\.(/|$)`', '/', $path, -1, $c2);
            // Resolve one "segment/.." pair per pass. The limit is 1 because
            // removing several pairs at once fails when climbing more than one
            // directory level; the next pass handles the following pair.
            $path = preg_replace('`/[^/]*?/\\.\\.(/|$)`', '/', $path, 1, $c3);
        }
        while ($c0 + $c1 + $c2 + $c3 > 0);

        return $path;
    }
}

参考

发布了94 篇原创文章 · 获赞 110 · 访问量 80万+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章