将 Web 浏览器自动化转换为 XmlHTTP 请求

问题描述

我创建了一个宏,可以从 Brief Profile(BP,简要资料)中抓取相关信息;这些资料可以在以下网站搜索:https://echa.europa.eu/information-on-chemicals

该宏通过对 Brief Profile 的 URL 发出 XMLHTTP 请求来实现,并且运行正常。

我现在希望创建一个宏来搜索同一网站以查找简要资料的 URL(href)。

作为 VBA 的初学者,我已经使用浏览器成功实现了这一点,但我希望将其转换为 XML HTTP 请求以提高效率。

使用 IE 浏览器自动化:

' Drives Internet Explorer to search the ECHA "information on chemicals" page
' for a hard-coded keyword and prints the href of the first element with the
' class "briefProfileLink" to the Immediate window.
' Uses early binding: SHDocVw (Microsoft Internet Controls) and
' MSHTML (Microsoft HTML Object Library) must be referenced.
Sub Gethref()

    Dim IE As New SHDocVw.InternetExplorer
    Dim HTMLDoc As MSHTML.HTMLDocument
    Dim HTMLInput As MSHTML.IHTMLElement
    Dim HTMLButton As MSHTML.IHTMLElement   ' was misspelled "HTMLButtom" in the declaration
    Dim HTMLhref As MSHTML.IHTMLElement

    'Go to Website
    IE.Visible = True
    IE.navigate "https://echa.europa.eu/information-on-chemicals"

    'Check Website is ready for search and set HTMLDoc to IE.Document for elements
    Do While IE.readyState <> READYSTATE_COMPLETE
        DoEvents    ' yield so the host application stays responsive while waiting
    Loop

    Set HTMLDoc = IE.document

    'Set value of SearchBox to keyword
    Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
    HTMLInput.Value = "Potassium mercaptoacetate"

    'Search for Result
    Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
    HTMLButton.Click

    'Check page has loaded
    ' Wait while the browser is NOT complete or is still busy. The original
    ' tested "= READYSTATE_COMPLETE", which keeps looping after the page is loaded.
    Do While IE.readyState <> READYSTATE_COMPLETE Or IE.Busy
        DoEvents
    Loop
    Set HTMLDoc = IE.document

    'Find Desired href
    Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
    Debug.Print HTMLhref.getAttribute("href")

End Sub

这应该将 Potassium mercaptoacetate 的 href 打印为 https://echa.europa.eu/brief-profile/-/briefprofile/100.000.602

我已经开始尝试尽可能多地使用 XML HTTP 进行转换,但遇到了我不太了解的问题

使用 XML HTTP 请求(无效)

' Non-working attempt to replace the browser automation with an XMLHTTP request.
' It is kept as written; the comments below mark the problems visible in the code.
Sub Gethref()

    Dim XMLPage As New MSXML2.XMLHTTP60
    Dim HTMLDoc As New MSHTML.HTMLDocument
    Dim HTMLInput As MSHTML.IHTMLElement
    ' Declared as "HTMLButtom", but the code below uses "HTMLButton".
    Dim HTMLButtom As MSHTML.IHTMLElement

Dim HTMLhref As MSHTML.IHTMLElement

'Go to Website
    ' Synchronous GET (third argument False); the response is never copied into HTMLDoc.
    XMLPage.Open "GET","https://echa.europa.eu/information-on-chemicals",False
    XMLPage.send
    
'Set value of SearchBox to keyword
    ' HTMLDoc is still the empty document created by "New" at this point.
    Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
    ' Incomplete statement: no keyword is assigned to the value.
    HTMLInput.Value

'Search for Result
    Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
    ' NOTE(review): HTMLDoc is not attached to a browser, so Click presumably cannot submit the search - confirm.
    HTMLButton.Click

'Check page has loaded
' "IE" is not declared anywhere in this procedure; the response text lives on XMLPage.
HTMLDoc.body.innerHTML = IE.document.responseText

'Find Desired href
    Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
    Debug.Print HTMLhref.getAttribute("href")

End Sub

我会在取得进展时更新,但如果有人能提供帮助,那就太好了。

解决方法

好的,这样应该可以了。事实证明,您需要使用适当的参数发出 HTTP POST 请求,才能得到包含所需链接的响应。

' Posts a search for a hard-coded keyword to the ECHA chemical search form and
' writes the href of every "a.substanceNameLink" in the result table to
' column A of "Sheet1", starting at row 1.
' Requires a reference to the Microsoft HTML Object Library (HTMLDocument) and
' an Excel version that provides WorksheetFunction.EncodeURL.
' NOTE(review): the p_auth token in Url and the formDate value look
' session/time specific - confirm they are still accepted before reuse.
Public Sub GetContent()
    Const Url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
    Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, I&, R&
    Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
    Dim pair$

    Set oHtml = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")
    Set MyDict = CreateObject("Scripting.Dictionary")
    Set Ws = ThisWorkbook.Worksheets("Sheet1")

    searchKeyword = "Acetone"

    ' Form fields sent in the POST body.
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
    MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
    MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"

    ' Build "key=value&key=value" with URL-encoded keys and values.
    ' The separator decision must depend on the payload built so far; the
    ' original tested Len(DictKey), which produced a leading "&".
    payload = ""
    For Each DictKey In MyDict
        pair = WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey))
        If Len(payload) = 0 Then
            payload = pair
        Else
            payload = payload & "&" & pair
        End If
    Next DictKey

    With oHttp
        .Open "POST", Url, False    ' False = synchronous, so responseText is ready after send
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send (payload)
        oHtml.body.innerHTML = .responseText
    End With

    ' One output row per matching link.
    With oHtml.querySelectorAll("table.table > tbody > tr > td > a.substanceNameLink")
        For I = 0 To .Length - 1
            R = R + 1
            Ws.Cells(R, 1) = .item(I).getAttribute("href")
        Next I
    End With
End Sub

如果您只对第一个链接感兴趣,请用下面这一行代替最后那个 With 块:

' Show only the first matching link: querySelector returns the first element that matches the selector.
MsgBox oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")

或者您可以直接从开发工具中复制这些参数并使用它们:

' Posts a search for "Acetone" to the ECHA chemical search form, using a query
' string and form body copied verbatim from the browser's developer tools, and
' prints the href of the first "a.substanceNameLink" in the result table.
' Requires a reference to the Microsoft HTML Object Library (HTMLDocument).
' NOTE(review): the p_auth token and formDate value look session/time
' specific - confirm they are still accepted before reuse.
Public Sub GetContent()
    Const Url = "https://echa.europa.eu/search-for-chemicals?"
    Dim oHttp As Object, oHtml As HTMLDocument
    Dim payload$, urlSuffix$

    Set oHtml = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")

    ' Query string appended to Url.
    urlSuffix = "p_auth=69hDou3E&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=" & _
                "_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=" & _
                "doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals" & _
                "%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview" & _
                "%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"

    ' Form body; the search keyword is the sskeywordKey field.
    payload = "_disssimplesearchhomepage_WAR_disssearchportlet_formDate=1621042609544&_disssimplesearch_WAR_disssearchportlet_searchOccurred=" & _
              "true&_disssimplesearch_WAR_disssearchportlet_sskeywordKey=Acetone&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer" & _
              "=true&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox=on"

    With oHttp
        ' The third argument of Open is the async flag, not the content type.
        ' False makes the call synchronous so responseText is ready after send.
        .Open "POST", Url & urlSuffix, False
        ' The content type belongs in a request header so the body is sent as form data.
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send (payload)
        oHtml.body.innerHTML = .responseText
    End With

    Debug.Print oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
End Sub