内网有个网页用了HTTP基本认证机制,想用gocolly爬取,不知道怎么登录,只好研究HTTP基本认证机制
参考这里:https://www.jb51.net/article/89070.htm
下面开始参考作者dotcoo了:-)
看了《HTTP权威指南》第12章“HTTP基本认证机制”(本站下载地址:https://www.jb51.net/books/93254.html),感觉讲得蛮详细的,于是写了一个小例子测试。
请求响应过程:
==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345

<==
HTTP/1.1 401 Unauthorized
WWW-Authenticate: Basic realm="Dotcoo User Login"

==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345
Authorization: Basic YWRtaW46YWRtaW5wd2Q=

<==
HTTP/1.1 200 OK
Content-Type: text/plain; charset=utf-8
golang HTTP基本认证机制的实现代码
package main

import (
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
)

// basicChallenge is the WWW-Authenticate value sent with every 401 response.
const basicChallenge = `Basic realm="Dotcoo User Login"`

// unauthorized answers with 401 plus a Basic challenge, which is what tells a
// client (or browser) to retry the request with credentials.
func unauthorized(w http.ResponseWriter) {
	w.Header().Set("WWW-Authenticate", basicChallenge)
	w.WriteHeader(http.StatusUnauthorized)
	io.WriteString(w, "Unauthorized!\n")
}

// parseBasicAuth splits an Authorization header value of the form
// "Basic <base64(username:password)>" into its username and password.
// The scheme name is matched case-insensitively. An error is returned when
// the scheme is not Basic, the payload is not valid base64, or the decoded
// payload has no ':' separator.
func parseBasicAuth(auth string) (string, string, error) {
	parts := strings.SplitN(auth, " ", 2)
	if len(parts) != 2 || !strings.EqualFold(parts[0], "Basic") {
		return "", "", errors.New("authorization header is not a Basic credential")
	}
	decoded, err := base64.StdEncoding.DecodeString(parts[1])
	if err != nil {
		return "", "", fmt.Errorf("decoding basic credentials: %w", err)
	}
	// Only the first ':' separates the fields; the password may contain more.
	creds := strings.SplitN(string(decoded), ":", 2)
	if len(creds) != 2 {
		return "", "", errors.New("basic credentials have no ':' separator")
	}
	return creds[0], creds[1], nil
}

// HelloServer is a demo handler for HTTP Basic authentication.
//
// A request without a well-formed "Authorization: Basic ..." header is
// answered with 401 and a WWW-Authenticate challenge. Any well-formed
// username/password pair is accepted (the demo does not check them against
// anything) and answered with "hello,world!".
//
// The header and the decoded credentials are printed to stdout so the
// exchange can be observed; do not do this outside a demo.
func HelloServer(w http.ResponseWriter, req *http.Request) {
	auth := req.Header.Get("Authorization")
	if auth == "" {
		unauthorized(w)
		return
	}
	fmt.Println(auth)

	username, password, err := parseBasicAuth(auth)
	if err != nil {
		// Previously these paths returned without writing a status, which
		// the client saw as 200 OK; a bad credential must be rejected.
		fmt.Println(err)
		unauthorized(w)
		return
	}
	fmt.Println(username + ":" + password)
	fmt.Println("Username:", username)
	fmt.Println("Password:", password)
	fmt.Println()

	io.WriteString(w, "hello,world!\n")
}

func main() {
	http.HandleFunc("/hello", HelloServer)
	if err := http.ListenAndServe(":8000", nil); err != nil {
		log.Fatal("ListenAndServe: ", err)
	}
}
试验了上面的例子后,基本明白了HTTP基本认证的过程。但是怎么用gocolly访问呢?
参考:https://stackoverflow.com/questions/50576248/using-colly-framework-i-cant-login-to-the-evernote-account
但是答复者Matías Insaurralde提供的模拟浏览器访问的例子编译不通过,不明白其中的hptsKey的意思。代码放在下面供参考(可跳过):
package evernote

import (
	"bytes"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"regexp"
	"strings"
)

const (
	evernoteLoginURL = "https://www.evernote.com/Login.action"
)

var (
	evernoteJSParamsExpr = regexp.MustCompile(`document.getElementById\("(.*)"\).value = "(.*)"`)
	evernoteRedirectExpr = regexp.MustCompile(`Redirecting to <a href="(.*)">`)

	// hptsKey and hptshKey were referenced but never declared in the listing.
	// NOTE(review): assumed to be the element ids "hpts" and "hptsh", matching
	// the form fields of the same names sent during login — confirm against
	// the actual login page.
	hptsKey  = []byte("hpts")
	hptshKey = []byte("hptsh")

	errNoMatches   = errors.New("No matches")
	errRedirectURL = errors.New("Redirect URL not found")
)

// EvernoteClient wraps all methods required to interact with the website.
type EvernoteClient struct {
	Username   string
	Password   string
	httpClient *http.Client

	// These parameters persist during the login process:
	hpts  string
	hptsh string
}

// NewEvernoteClient initializes a new Evernote client whose HTTP client keeps
// cookies in a fresh jar, mimicking browser behavior.
func NewEvernoteClient(username, password string) *EvernoteClient {
	// The error is discarded as in the original listing.
	// NOTE(review): cookiejar.New is believed never to fail with nil options — confirm.
	cookieJar, _ := cookiejar.New(nil)

	c := &EvernoteClient{
		Username: username,
		Password: password,
	}

	// Copy the defaults from http.DefaultClient and attach the cookie jar.
	c.httpClient = &http.Client{
		Transport:     http.DefaultTransport,
		CheckRedirect: http.DefaultClient.CheckRedirect,
		Jar:           cookieJar,
		Timeout:       http.DefaultClient.Timeout,
	}
	return c
}

// extractJSParams scans body for `document.getElementById("<id>").value = "<v>"`
// assignments and stores the values for the hpts and hptsh ids on the client.
// It returns errNoMatches when no such assignment is found.
func (e *EvernoteClient) extractJSParams(body []byte) (err error) {
	matches := evernoteJSParamsExpr.FindAllSubmatch(body, -1)
	if len(matches) == 0 {
		return errNoMatches
	}
	for _, submatches := range matches {
		if len(submatches) < 3 {
			// The original assigned this error and then returned nil.
			return errNoMatches
		}
		key, val := submatches[1], submatches[2]
		if bytes.Equal(key, hptsKey) {
			e.hpts = string(val)
		}
		if bytes.Equal(key, hptshKey) {
			e.hptsh = string(val)
		}
	}
	return nil
}

// readBody reads the whole response body and closes it.
func readBody(res *http.Response) ([]byte, error) {
	defer res.Body.Close()
	return ioutil.ReadAll(res.Body)
}

// postLoginForm posts values, form-encoded, to the login URL with the given
// Accept and Content-Type headers and returns the response body.
func (e *EvernoteClient) postLoginForm(values url.Values, accept, contentType string) ([]byte, error) {
	req, err := http.NewRequest(http.MethodPost, evernoteLoginURL, strings.NewReader(values.Encode()))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Accept", accept)
	req.Header.Set("Content-Type", contentType)
	req.Header.Set("x-requested-with", "XMLHttpRequest")
	req.Header.Set("referer", evernoteLoginURL)

	res, err := e.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	return readBody(res)
}

// Login handles the login action.
func (e *EvernoteClient) Login() error {
	// First step: fetch the login page as a browser visitor would do:
	res, err := e.httpClient.Get(evernoteLoginURL)
	if err != nil {
		return err
	}
	if res.Body == nil {
		return errors.New("No response body")
	}
	body, err := readBody(res)
	if err != nil {
		return err
	}
	if err = e.extractJSParams(body); err != nil {
		return err
	}

	// Second step: we have extracted the "hpts" and "hptsh" parameters.
	// We send a request using only the username and setting "evaluateUsername":
	values := url.Values{}
	values.Set("username", e.Username)
	values.Set("evaluateUsername", "")
	values.Set("analyticsloginorigin", "login_action")
	values.Set("clipperFlow", "false")
	values.Set("showSwitchService", "true")
	values.Set("hpts", e.hpts)
	values.Set("hptsh", e.hptsh)

	body, err = e.postLoginForm(values, "application/json", "application/x-www-form-urlencoded; charset=UTF-8")
	if err != nil {
		return err
	}
	if !strings.Contains(string(body), `"usePasswordAuth":true`) {
		return errors.New("Password auth not enabled")
	}

	// Third step: do the final request, append password to form data:
	values.Del("evaluateUsername")
	values.Set("password", e.Password)
	values.Set("login", "Sign in")

	// NOTE(review): this request was garbled in the listing; only "text/html"
	// and the Content-Type header name survived. The URL, body, Content-Type
	// value and remaining headers are reconstructed by analogy with the second
	// step — confirm against the original answer.
	body, err = e.postLoginForm(values, "text/html", "application/x-www-form-urlencoded")
	if err != nil {
		return err
	}

	// Check the body in order to find the redirect URL:
	matches := evernoteRedirectExpr.FindAllStringSubmatch(string(body), -1)
	if len(matches) == 0 {
		return errRedirectURL
	}
	m := matches[0]
	if len(m) < 2 {
		return errRedirectURL
	}
	redirectURL := m[1]
	fmt.Println("Login is ok,redirect URL:", redirectURL)
	return nil
}

// After you successfully get the redirect URL, you should be able to send
// authenticated requests as long as you keep using the HTTP client that was
// used for the login process; the cookie jar plays a very important role here.
//
// To call this code use:
func main() {
	// NOTE(review): "[email protected]" looks like an email-obfuscation
	// placeholder left by the hosting site; substitute the real account email.
	evernoteClient := NewEvernoteClient("[email protected]", "password")
	err := evernoteClient.Login()
	if err != nil {
		panic(err)
	}
}
只好自己写,经反复试验,发现对于本文开头自己写的server,只需以下代码即可通过验证,输出了hello,world!(将访问方式改为POST也一样。)
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"time"
)

// Login sends a single GET request with HTTP Basic credentials to the demo
// server at http://localhost:8000/hello and prints the response headers, the
// Www-Authenticate header and the body. Failures are reported on stdout and
// end the function early instead of dereferencing a nil response.
func Login() {
	// A timeout keeps the demo from hanging when the server is not running.
	client := &http.Client{Timeout: 10 * time.Second}

	// URL to visit.
	url := "http://localhost:8000/hello"

	// Request to submit.
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	// The key line: sets the Authorization header. The original note says the
	// username and password can be anything for this demo server.
	req.SetBasicAuth("aa", "bb")

	// Report the method actually used (the listing printed POST for a GET).
	fmt.Println(req.Method + "访问")

	res, err := client.Do(req)
	if err != nil {
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	fmt.Println("header:")
	fmt.Println(res.Header)

	fmt.Println("realm:")
	fmt.Println(res.Header.Get("Www-Authenticate"))

	fmt.Println("body:")
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading body:", err)
		return
	}
	fmt.Println(string(body))
}

func main() {
	Login()
}
查看SetBasicAuth的定义为(LiteIDE中在光标位置按Ctrl+Shift+J):
// SetBasicAuth sets the request's Authorization header to "Basic " followed by
// the result of basicAuth(username, password).
func (r *Request) SetBasicAuth(username,password string) { r.Header.Set("Authorization","Basic "+basicAuth(username,password)) }
而basicAuth的定义为
// basicAuth joins username and password with a ':' and returns the standard
// base64 encoding of the result, i.e. the credential part of an
// "Authorization: Basic ..." header.
func basicAuth(username, password string) string {
	auth := username + ":" + password
	return base64.StdEncoding.EncodeToString([]byte(auth))
}
那么,用gocolly访问的代码如下:
package main import ( "encoding/base64" "fmt" "net/http" "github.com/gocolly/colly" ) func basicAuth(username,password string) string { auth := username + ":" + password return base64.StdEncoding.EncodetoString([]byte(auth)) } func main() { c := colly.NewCollector() h := http.Header{} h.Set("Authorization","Basic "+basicAuth("aaaa","bbbb")) c.OnResponse(func(r *colly.Response) { //fmt.Println(r) fmt.Println(string(r.Body)) }) c.Request("GET","http://localhost:8000/hello",nil,h) }
注:对于其他网站,也许要用fiddler抓包,设置相应的header和cookie才行。