(golang)HTTP基本認證機制及使用gocolly登錄爬取

內網有個網頁用了HTTP基本認證機制,想用gocolly爬取,不知道怎麼登錄,只好研究HTTP基本認證機制

參考這裏:https://www.jb51.net/article/89070.htm  

下面開始參考作者dotcoo了:-)

看了《HTTP權威指南》第12章HTTP基本認證機制(本站下載地址:https://www.jb51.net/books/93254.html),感覺講的蠻詳細的,寫了一個小小例子測試。

請求響應過程:

==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345
<==
HTTP/1.1 401 Unauthorized
WWW-Authenticate: Basic realm="Dotcoo User Login"
==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345
Authorization: Basic YWRtaW46YWRtaW5wd2Q=
<==
HTTP/1.1 200 OK
Content-Type: text/plain; charset=utf-8

golang HTTP基本認證機制的實現代碼

package main
import (
    "fmt"
    "io"
    "net/http"
    "log"
    "encoding/base64"
    "strings"
)
// HelloServer is a demo handler guarded by HTTP Basic authentication.
//
// It parses the Authorization header by hand, prints the decoded user name
// and password to stdout (demo only: the credentials are not validated
// against anything) and then answers "hello, world!\n".
//
// Every request that cannot be authenticated -- header missing, header
// without a "<scheme> <credentials>" shape, scheme other than Basic,
// credentials that are not valid base64, or decoded credentials without a
// ':' separator -- is answered with 401 Unauthorized plus a WWW-Authenticate
// challenge, so the client knows it has to (re)send credentials.
func HelloServer(w http.ResponseWriter, req *http.Request) {
    auth := req.Header.Get("Authorization")
    if auth == "" {
        sendAuthChallenge(w)
        return
    }
    fmt.Println(auth)

    // Expected shape: "<scheme> <credentials>".
    auths := strings.SplitN(auth, " ", 2)
    if len(auths) != 2 {
        fmt.Println("error")
        sendAuthChallenge(w)
        return
    }
    authMethod, authB64 := auths[0], auths[1]

    // Authentication scheme names are case-insensitive, so accept "basic" too.
    if !strings.EqualFold(authMethod, "Basic") {
        fmt.Println("error")
        sendAuthChallenge(w)
        return
    }

    authstr, err := base64.StdEncoding.DecodeString(authB64)
    if err != nil {
        fmt.Println(err)
        // The status line must be written before the body.
        sendAuthChallenge(w)
        io.WriteString(w, "Unauthorized!\n")
        return
    }
    fmt.Println(string(authstr))

    // Split on the first ':' only: the password itself may contain colons.
    userPwd := strings.SplitN(string(authstr), ":", 2)
    if len(userPwd) != 2 {
        fmt.Println("error")
        sendAuthChallenge(w)
        return
    }
    username := userPwd[0]
    password := userPwd[1]
    fmt.Println("Username:", username)
    fmt.Println("Password:", password)
    fmt.Println()

    io.WriteString(w, "hello, world!\n")
}

// sendAuthChallenge writes the WWW-Authenticate header and the 401 status
// line. Nothing else may have been written to w before it is called.
func sendAuthChallenge(w http.ResponseWriter) {
    w.Header().Set("WWW-Authenticate", `Basic realm="Dotcoo User Login"`)
    w.WriteHeader(http.StatusUnauthorized)
}
// main registers HelloServer on /hello and serves HTTP on port 8000 until
// the listener fails, at which point the error is logged and the process exits.
func main() {
    http.HandleFunc("/hello", HelloServer)
    if err := http.ListenAndServe(":8000", nil); err != nil {
        log.Fatal("ListenAndServe: ", err)
    }
}

試驗了上面的例子後,基本明白了HTTP基本認證的過程。但是怎麼用gocolly訪問呢?

參考:https://stackoverflow.com/questions/50576248/using-colly-framework-i-cant-login-to-the-evernote-account

但是答覆者Matías Insaurralde提供的模擬瀏覽器訪問的例子編譯不通過,不明白其中的hptsKey的意思。代碼放在下面供參考(可跳過):

package evernote

import (
    "bytes"
    "errors"
    "fmt"
    "io/ioutil"
    "net/http"
    "net/http/cookiejar"
    "net/url"
    "regexp"
    "strings"
)

const (
    // evernoteLoginURL is the single endpoint used by every step of the
    // login flow: it is fetched with GET first, then POSTed to twice, and it
    // is also sent as the referer of those POST requests.
    evernoteLoginURL = "https://www.evernote.com/Login.action"
)

var (
    // evernoteJSParamsExpr matches statements of the form
    // document.getElementById("<id>").value = "<value>" and captures the
    // element id (group 1) and the assigned value (group 2).
    evernoteJSParamsExpr = regexp.MustCompile(`document.getElementById\("(.*)"\).value = "(.*)"`)
    // evernoteRedirectExpr captures the href (group 1) of the
    // "Redirecting to <a href=...>" link in the final login response.
    evernoteRedirectExpr = regexp.MustCompile(`Redirecting to <a href="(.*)">`)

    // errNoMatches is returned when the login page contains no JS parameters.
    errNoMatches   = errors.New("No matches")
    // errRedirectURL is returned when the final response has no redirect link.
    errRedirectURL = errors.New("Redirect URL not found")
)

// EvernoteClient wraps all methods required to interact with the website.
type EvernoteClient struct {
    // Username is sent as the "username" form field during login.
    Username   string
    // Password is sent as the "password" form field in the last login step.
    Password   string
    // httpClient performs every request of the login flow; it carries the
    // cookie jar set up by NewEvernoteClient.
    httpClient *http.Client

    // These parameters persist during the login process: they are filled in
    // by extractJSParams and sent back as the "hpts" / "hptsh" form fields.
    hpts  string
    hptsh string
}

// NewEvernoteClient returns a client for the given credentials whose HTTP
// client mirrors http.DefaultClient (transport, redirect policy, timeout)
// but owns a fresh cookie jar, so cookies set during login are replayed on
// later requests the way a browser would.
func NewEvernoteClient(username, password string) *EvernoteClient {
    // The error is discarded exactly as before.
    // NOTE(review): cookiejar.New is not expected to fail for nil options -- confirm.
    jar, _ := cookiejar.New(nil)

    browser := &http.Client{
        Transport:     http.DefaultTransport,
        CheckRedirect: http.DefaultClient.CheckRedirect,
        Jar:           jar,
        Timeout:       http.DefaultClient.Timeout,
    }

    return &EvernoteClient{
        Username:   username,
        Password:   password,
        httpClient: browser,
    }
}

// extractJSParams scans the login page for
// document.getElementById("<id>").value = "<value>" assignments and stores
// the values assigned to the "hpts" and "hptsh" elements on the client.
//
// It returns errNoMatches when the page contains no such assignment at all,
// or when a match does not carry both capture groups.
func (e *EvernoteClient) extractJSParams(body []byte) error {
    matches := evernoteJSParamsExpr.FindAllSubmatch(body, -1)
    if len(matches) == 0 {
        return errNoMatches
    }
    for _, submatches := range matches {
        // Full match plus two capture groups are required.
        if len(submatches) < 3 {
            return errNoMatches
        }
        val := string(submatches[2])

        // The element ids are compared against literals instead of the
        // undefined hptsKey/hptshKey identifiers.
        // NOTE(review): the ids are assumed to equal the form field names
        // ("hpts", "hptsh") that Login posts back -- confirm against the page.
        switch string(submatches[1]) {
        case "hpts":
            e.hpts = val
        case "hptsh":
            e.hptsh = val
        }
    }
    return nil
}

// Login handles the login action.
//
// It runs three requests against evernoteLoginURL with the client's own
// HTTP client (and therefore its cookie jar):
//
//  1. GET the login page and extract the "hpts"/"hptsh" parameters.
//  2. POST the username only, with "evaluateUsername" set, and require the
//     answer to contain `"usePasswordAuth":true`.
//  3. POST username and password, then look for the redirect link in the body.
//
// On success the redirect URL is printed and nil is returned. Every response
// body is closed once it has been read.
func (e *EvernoteClient) Login() error {
    // First step: fetch the login page as a browser visitor would do:
    res, err := e.httpClient.Get(evernoteLoginURL)
    if err != nil {
        return err
    }
    body, err := readAndClose(res)
    if err != nil {
        return err
    }
    if err = e.extractJSParams(body); err != nil {
        return err
    }

    // Second step: we have extracted the "hpts" and "hptsh" parameters
    // We send a request using only the username and setting "evaluateUsername":
    values := url.Values{}
    values.Set("username", e.Username)
    values.Set("evaluateUsername", "")
    values.Set("analyticsLoginOrigin", "login_action")
    values.Set("clipperFlow", "false")
    values.Set("showSwitchService", "true")
    values.Set("hpts", e.hpts)
    values.Set("hptsh", e.hptsh)

    body, err = e.postLoginForm(values, "application/json")
    if err != nil {
        return err
    }
    if !strings.Contains(string(body), `"usePasswordAuth":true`) {
        return errors.New("Password auth not enabled")
    }

    // Third step: do the final request, append password to form data:
    values.Del("evaluateUsername")
    values.Set("password", e.Password)
    values.Set("login", "Sign in")

    body, err = e.postLoginForm(values, "text/html")
    if err != nil {
        return err
    }

    // Check the body in order to find the redirect URL:
    matches := evernoteRedirectExpr.FindAllStringSubmatch(string(body), -1)
    if len(matches) == 0 || len(matches[0]) < 2 {
        return errRedirectURL
    }
    fmt.Println("Login is ok, redirect URL:", matches[0][1])
    return nil
}

// postLoginForm POSTs the url-encoded form to evernoteLoginURL with the
// XHR-style headers used by the login page and the given Accept header, and
// returns the complete response body.
func (e *EvernoteClient) postLoginForm(values url.Values, accept string) ([]byte, error) {
    req, err := http.NewRequest(http.MethodPost, evernoteLoginURL, bytes.NewBufferString(values.Encode()))
    if err != nil {
        return nil, err
    }
    req.Header.Set("Accept", accept)
    req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
    req.Header.Set("x-requested-with", "XMLHttpRequest")
    req.Header.Set("referer", evernoteLoginURL)

    res, err := e.httpClient.Do(req)
    if err != nil {
        return nil, err
    }
    return readAndClose(res)
}

// readAndClose reads the whole response body and closes it, so the
// underlying connection is released instead of leaking.
func readAndClose(res *http.Response) ([]byte, error) {
    if res.Body == nil {
        return nil, errors.New("No response body")
    }
    defer res.Body.Close()
    return ioutil.ReadAll(res.Body)
}
After you successfully get the redirect URL, you should be able to send authenticated requests as long as you keep using the HTTP client that was used for the login process; the cookie jar plays a very important role here.

To call this code use:

// main logs in with placeholder credentials and panics if the login fails.
func main() {
    client := NewEvernoteClient("user@company", "password")
    if err := client.Login(); err != nil {
        panic(err)
    }
}

只好自己寫,經反覆試驗,發現對於本文開頭自己寫的server,只需以下代碼即可通過驗證,輸出了hello,world!(將訪問方式改爲POST也一樣。)

package main

import (
    "fmt"

    "io/ioutil"
    "net/http"
)

// Login sends one request carrying HTTP Basic credentials to the local demo
// server and prints the response headers, the Www-Authenticate header value
// and the body to stdout.
//
// Any failure (building the request, sending it, reading the body) is
// printed and ends the function early; in particular a server that is not
// reachable no longer causes a nil-pointer panic.
func Login() {
    // Client with default settings.
    client := &http.Client{}
    // URL to visit.
    target := "http://localhost:8000/hello"

    req, err := http.NewRequest("GET", target, nil)
    if err != nil {
        fmt.Println("create request:", err)
        return
    }
    // The key line: the demo server accepts any user name and password.
    req.SetBasicAuth("aa", "bb")
    // Print the method that is really used rather than a hard-coded one.
    fmt.Println(req.Method + "訪問")

    res, err := client.Do(req)
    if err != nil {
        // res is nil here, so it must not be touched.
        fmt.Println("send request:", err)
        return
    }
    defer res.Body.Close()

    fmt.Println("header:")
    fmt.Println(res.Header)
    fmt.Println("realm:")
    fmt.Println(res.Header.Get("Www-Authenticate"))
    fmt.Println("body:")
    body, err := ioutil.ReadAll(res.Body)
    if err != nil {
        fmt.Println("read body:", err)
        return
    }
    fmt.Println(string(body))
}

// main runs the Basic-auth demo request once.
func main() {   
    Login()  
}

查看SetBasicAuth的定義爲(liteide中在光標位置按Ctrl+shift+J):

// SetBasicAuth sets the request's Authorization header to "Basic " followed
// by the basicAuth encoding of the given username and password.
func (r *Request) SetBasicAuth(username, password string) {
    r.Header.Set("Authorization", "Basic "+basicAuth(username, password))
}

而basicAuth的定義爲

// basicAuth joins username and password with a ':' and returns the standard
// base64 encoding of the result.
func basicAuth(username, password string) string {
    auth := username + ":" + password
    return base64.StdEncoding.EncodeToString([]byte(auth))
}

那麼,用gocolly訪問的代碼如下:

package main

import (
    "encoding/base64"
    "fmt"
    "net/http"

    "github.com/gocolly/colly"
)

// basicAuth returns the standard base64 encoding of "username:password",
// i.e. the credentials part of an "Authorization: Basic ..." header.
func basicAuth(username, password string) string {
    credentials := []byte(username + ":" + password)
    return base64.StdEncoding.EncodeToString(credentials)
}
// main fetches the demo page through a colly collector, sending the HTTP
// Basic credentials in a hand-built Authorization header, and prints the
// response body. A failed request is reported instead of being silently
// dropped.
func main() {
    c := colly.NewCollector()

    // Same header that http.Request.SetBasicAuth would produce.
    h := http.Header{}
    h.Set("Authorization", "Basic "+basicAuth("aaaa", "bbbb"))

    c.OnResponse(func(r *colly.Response) {
        fmt.Println(string(r.Body))
    })

    // Without this check a connection error or a rejected request would
    // leave the program with no output at all.
    if err := c.Request("GET", "http://localhost:8000/hello", nil, nil, h); err != nil {
        fmt.Println("request failed:", err)
    }
}

注:對於其他網站,也許要用Fiddler抓包,設置相應的header和cookie纔行。

本文轉自:https://www.cnblogs.com/pu369/p/10408898.html

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章