我已经写了一个Url抓取代码,在必应和谷歌上工作,并导航良好的网页。
我现在正在尝试将它设置为在duckduckgo.com上工作。我已经让它起作用了,所以它会从一页纸上刮下来。唯一的问题是,我不能锻炼,如何让它显示更多的结果,如何导航。它只从第一页提取结果。
Google和Bing有一个next按钮,代码可以导航,但是我想不出如何实现duckduckgo。我被困在最后一点上了。其余的都很好。搜索结果,要导航的页面和2倍的延迟来自Sheet10
我不需要完全重写代码.我所需要的帮助就是如何浏览这些页面.
Private Sub duckduckgoScraper()
'''DuckDuckGo URL SCRAPER
Dim ie As Object
Dim HTMLdoc As Object
Dim nextPageElement As Object
Dim div As Object
Dim link As Object
Dim url As String
Dim pageNumber As Long
Dim i As Long
Dim myCounter As Long
'''Takes search from Sheet10 to DuckDuckGo
url = "https://duckduckgo.com/?q=" & Replace(Worksheets("Sheet10").Range("G17").Value & Range("H17").Value, " ", "+")
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = True
.navigate url
Do While .Busy Or .readyState <> 4
DoEvents
Loop
End With
Application.Wait Now + TimeSerial(0, 0, 5)
Set HTMLdoc = ie.document
'''Searches URLS and places them in Sheet called Sheet2 ROW 2 Column A
With Sheets("Sheet2")
pageNumber = 1
i = 2
Do
For Each div In HTMLdoc.getElementsByTagName("div")
If div.getAttribute("class") = "result__body links_main links_deep" Then
Set link = div.getElementsByTagName("a")(0)
.Cells(i, 1).Value = link.getAttribute("href")
i = i + 1
End If
Next div
'''Searches Number of Pages entered in Sheet10
If pageNumber >= Replace(Worksheets("Sheet10").Range("I17").Value, " ", "+") Then Exit Do
On Error Resume Next
'''################################################################################################
'''########################## **I am stuck here, the rest is fine** #############################
'''################################################################################################
Set nextPageElement = HTMLdoc.getElementByClassName("I NEED THIS BIT, I AM STUCK HERE")
If nextPageElement Is Nothing Then Exit Do
'''Scrolls Down the Browser
ie.document.parentWindow.Scroll 0&, 99999
'''Random delay from Max number entered in Sheet10
Application.Wait Now + TimeSerial(0, 0, Application.RandBetween(1, Worksheets("Sheet10").Range("J17").Value))
'''Click the next page
nextPageElement.Click
Do While ie.Busy Or ie.readyState <> 4
DoEvents
Loop
'''Random delay from Max number entered in Sheet10
Application.Wait Now + TimeSerial(0, 0, Application.RandBetween(1, Worksheets("Sheet10").Range("K17").Value))
Set HTMLdoc = ie.document
''' Delete duplicates
Sheet2.Columns("A").RemoveDuplicates Columns:=Array(1), Header:=xlYes
''' Delete Row If Blank
Sheet2.Columns("A").SpecialCells(xlCellTypeBlanks).EntireRow.Delete
pageNumber = pageNumber + 1
myCounter = myCounter + 1
Worksheets("Sheet10").Range("G6").Value = myCounter
Loop
End With
'''Quite browser and clear
ie.Quit
Set ie = Nothing
Set HTMLdoc = Nothing
Set nextPageElement = Nothing
Set div = Nothing
Set link = Nothing
''' To stop the code early, change page number to 0, else code will finish when page number completed
If Sheet10.Range("I17") = 0 Then
Complete.Show
Termination.Hide
ElseIf Sheet10.Range("I17") > 0 Then
Complete.Show
End If
End Sub到目前为止我已经尝试过了,我已经尝试过黄色的部分,但是我无法让它工作。单击next按钮时,rld-1将更改为rdl-2和3。这是我现在唯一坚持的一点。


有人能告诉我。我认为这可能是因为它是JavaScript,但我在编程方面的知识有限,并且已经坚持了几天了。
一如既往,提前谢谢
发布于 2020-05-25 16:35:30
试着点击“更多的结果”按钮,先加载所有的页面。我是用这样的线条做的
Dim objMoreResults As Object, p As Long
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = True
.Navigate url
Do While .Busy Or .readyState <> 4: DoEvents: Loop
End With
p = 1
backP:
On Error Resume Next
Set objMoreResults = ie.document.getElementById("rld-" & p)
On Error GoTo 0
If Not objMoreResults Is Nothing Then
objMoreResults.getElementsByTagName("a")(0).Click
Set objMoreResults = Nothing: p = p + 1
Application.Wait Now + TimeSerial(0, 0, 3): GoTo backP
End If这是对我来说工作正常的完整代码。在尝试编辑代码行之前,先尝试这段代码,看看这段代码是否有效。
Sub DuckDuckGo_Scraper()
Dim x, ie As Object, objMoreResults As Object, htmlDoc As Object, div As Object, sURL As String, p As Long, i As Long
x = Application.InputBox("Enter The Number Of Pages", , 2)
If Not IsNumeric(x) Then Exit Sub
p = 1: i = 1
sURL = "https://duckduckgo.com/?q=" & Replace(Worksheets("Sheet10").Range("G17").Value & Range("H17").Value, " ", "+")
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = True
.Navigate sURL
Do While .Busy Or .readyState <> 4: DoEvents: Loop
BackP:
On Error Resume Next
Set objMoreResults = ie.document.getElementById("rld-" & p)
On Error GoTo 0
If Not objMoreResults Is Nothing Then
objMoreResults.getElementsByTagName("a")(0).Click
Set objMoreResults = Nothing: p = p + 1: If p = Val(x) Then GoTo NextP
Application.Wait Now + TimeSerial(0, 0, 3): GoTo BackP
End If
NextP:
Application.Wait Now + TimeSerial(0, 0, 3)
Set htmlDoc = .document
For Each div In htmlDoc.getElementsByClassName("result__title")
i = i + 1
Worksheets("Sheet2").Cells(i, 1).Value = div.getElementsByTagName("a")(0).href
Next div
.Quit
End With
Set ie = Nothing: Set htmlDoc = Nothing: Set div = Nothing
End Sub发布于 2020-05-26 13:50:40
好吧,这不是世界上最好的代码,但它有效。多亏了YasserKhalil,没有他,我不可能做到这一点。我必须对这两种代码进行修改才能让它起作用。再次感谢YasserKhalil
Private Sub duckduckgoScraper()
'''DuckDuckGo URL SCRAPER
Dim ie As Object
Dim HTMLdoc As Object
Dim div As Object
Dim link As Object
Dim url As String
Dim i As Long
Dim myCounter As Long
Dim objMoreResults As Object
Dim p As Long
'''Takes seach from Sheet10 to google
url = "https://duckduckgo.com/?q=" & Replace(Worksheets("Sheet10").Range("G17").Value & Range("H17").Value, " ", "+")
On Error Resume Next
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = True
.navigate url
Do While .Busy Or .readyState <> 4
DoEvents
Loop
End With
Application.Wait Now + TimeSerial(0, 0, 5)
Set HTMLdoc = ie.document
'''Searches URLS and places them in Sheet called Sheet2 ROW 2 Column A
With Sheets("Sheet2")
pageNumber = 1
i = 2
Do
p = 1
backP:
Set objMoreResults = ie.document.getElementById("rld-" & p)
On Error GoTo 0
''' End do in NO MORE results
If objMoreResults Is Nothing Then Exit Do
'''If objMoreResults not same as pages requested on sheet10 I17
If objMoreResults <> Sheet10.Range("I17").Value Then
objMoreResults.getElementsByTagName("a")(0).Click
Set objMoreResults = Nothing: p = p + 1
'''Scrolls Down the Browser
ie.document.parentWindow.Scroll 0&, 99999
'''Random delay from Max number entered in Sheet10
Application.Wait Now + TimeSerial(0, 0, Application.RandBetween(1, Worksheets("Sheet10").Range("J17").Value))
' nextPageElement.Click 'next web page
Do While ie.Busy Or ie.readyState <> 4
DoEvents
Loop
'''Random delay from Max number entered in Sheet10
Application.Wait Now + TimeSerial(0, 0, Application.RandBetween(1, Worksheets("Sheet10").Range("K17").Value))
Set HTMLdoc = ie.document
On Error Resume Next
''' extract urls
For Each div In HTMLdoc.getElementsByTagName("div")
If div.getAttribute("class") = "result__body links_main links_deep" Then
'If div.getAttribute("class") = "result__a" Then
Set link = div.getElementsByTagName("a")(0)
.Cells(i, 1).Value = link.getAttribute("href")
i = i + 1
End If
Next div
'''' Delete duplicates
Sheet2.Columns("A").RemoveDuplicates Columns:=Array(1), Header:=xlYes
'''' Delete Row If Blank
Sheet2.Columns("A").SpecialCells(xlCellTypeBlanks).EntireRow.Delete
'pageNumber = pageNumber + 1
''' Loop Counter
myCounter = myCounter + 1
Worksheets("Sheet10").Range("G6").Value = myCounter
''' If same as Sheet10 G6 then end
If Sheet10.Range("G6").Value = Sheet10.Range("I17").Value Then Exit Do
Application.Wait Now + TimeSerial(0, 0, 3):
GoTo backP
End If
Loop
End With
ie.Quit
Set ie = Nothing
Set HTMLdoc = Nothing
Set objMoreResults = Nothing
Set div = Nothing
Set link = Nothing
If Sheet10.Range("I17") = 0 Then
Complete.Show
Termination.Hide
ElseIf Sheet10.Range("I17") > 0 Then
Complete.Show
End If
End Subhttps://stackoverflow.com/questions/62006067
复制相似问题