我正在尝试为某个特定网站编写一个网络爬虫,但由于某种原因无法连接到该网站。程序报错,提示无法连接。我用 Selenium 访问该网站时,也发现它没有连接上。
作为一个新手,我可能犯了一个愚蠢的错误,但我想不出是什么。希望你愿意帮助我。
import csv
import requests
import datetime
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Start a Chrome session at import time, using the chromedriver binary at this path.
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
# Place the browser window at screen coordinates (0, 0).
browser.set_window_position(0,0)
# Block until the user presses Enter; the entered text is stored in `captcha`
# but is not read anywhere in the code shown here.
# NOTE(review): the __main__ section rebinds `browser` to the driver returned by
# get_driver(), so this first session is not the one used for scraping and is
# never quit in the code shown here -- confirm that is intended.
captcha = input('Press Enter after bypassing Captcha')
# def get_driver():
# driver = webdriver.Chrome()
# return driver
def get_driver():
    """Create a Chrome WebDriver that runs in headless mode.

    Returns:
        A ``webdriver.Chrome`` instance started with the ``--headless``
        argument. The caller owns the driver and must call ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # ``options=`` is the supported keyword; ``chrome_options=`` is the
    # deprecated spelling and is rejected by current Selenium releases.
    return webdriver.Chrome(options=options)
def connect_to_base(browser, page_number):
    """Load one page of the jaap.nl Amsterdam listing, retrying on failure.

    Args:
        browser: WebDriver-like object; its ``get(url)`` method is called.
        page_number: Page index interpolated into the listing URL.

    Returns:
        True as soon as an element with class ``result-content`` is present
        on the loaded page; False after three failed attempts.
    """
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            browser.get(base_url)
            # Wait up to 10 seconds for the result container to appear
            # before reporting success.
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'result-content')))
            return True
        except Exception as ex:
            # Best-effort retry: report the cause so a persistent failure
            # can be diagnosed instead of being silently swallowed.
            print(f'Error connecting to {base_url}: {ex!r}')
            print(f'Attempt #{attempt}')
    return False
def _accept_cookie_banner():
    """Click the cookie-consent link on the current page if one is present."""
    consent_links = browser.find_elements(By.XPATH, "//a[@class='CookiesOK']")
    if consent_links:
        consent_links[0].click()


def _load_detail_page(url, attempts=3):
    """Open *url* in the shared browser; return True once the address block is present."""
    for attempt in range(1, attempts + 1):
        try:
            browser.get(url)
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'detail-address')))
            return True
        except Exception as ex:
            # Best-effort retry; show the cause so failures can be diagnosed.
            print(f'Error connecting to {url}: {ex!r}')
            print(f'Attempt #{attempt}')
    return False


def _tab_values(details, tab_class):
    """Join the stripped text of every ``td.value`` cell in the first matching tab.

    Returns 'Unknown' when no ``div`` with class *tab_class* exists.
    """
    tabs = details.find_all('div', {'class': tab_class})
    if not tabs:
        return 'Unknown'
    return ','.join(td.text.strip() for td in tabs[0].select('td.value'))


def parse_html(html):
    """Visit every listing linked from a result page and collect its details.

    Each ``<a class="property-inner">`` anchor with an ``href`` is opened in
    the module-level ``browser``; the address and the "kenmerken" and
    "woningwaarde" tab values are read from the loaded detail page.

    Args:
        html: HTML source of a result page.

    Returns:
        A list with one dict per listing, with keys ``adres``, ``kenmerken``,
        ``waarde`` and ``url`` (the anchor's href as found in *html*). Fields
        that cannot be read are set to 'Unknown'.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    # Only anchors that carry an href can be followed.
    anchors = soup.find_all('a', {'class': 'property-inner'}, href=True)

    output_list = []
    for anchor in anchors:
        href = anchor.get('href')
        # An absolute href is returned unchanged; a relative one is resolved
        # against the site root.
        detail_url = urljoin('https://www.jaap.nl/', href)

        _accept_cookie_banner()

        if _load_detail_page(detail_url):
            details = BeautifulSoup(browser.page_source, 'html')
            address_nodes = details.find_all('div', {'class': 'detail-address'})
            if address_nodes:
                adres = address_nodes[0].get_text(separator=',', strip=True)
            else:
                adres = 'Unknown'
            tr_kenmerken = _tab_values(details, 'detail-tab-content kenmerken')
            tr_waarde = _tab_values(details, 'detail-tab-content woningwaarde')
        else:
            # The page never loaded, so the browser may still show a previous
            # listing; do not scrape stale content.
            adres = tr_kenmerken = tr_waarde = 'Unknown'

        output_list.append({
            'adres': adres,
            'kenmerken': tr_kenmerken,
            'waarde': tr_waarde,
            'url': href,
        })
    return output_list
def get_load_time(article_url):
    """Measure how long a GET request to *article_url* takes.

    Args:
        article_url: URL to request.

    Returns:
        The elapsed time of the response in seconds (float), or the string
        'Loading Error' if the request could not be made or timed out.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    try:
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000)
    except Exception:
        # Deliberately best-effort: any failure is reported as a load error.
        return 'Loading Error'
    try:
        return response.elapsed.total_seconds()
    finally:
        # With stream=True the connection is held open until the body is
        # consumed or the response is closed; the body is never read here.
        response.close()
def write_to_file(output_list, filename):
    """Append listing rows to a CSV file (no header row is written).

    Args:
        output_list: Iterable of dicts with keys ``adres``, ``kenmerken``,
            ``waarde`` and ``url``. A legacy ``link`` key is accepted as an
            alias for ``url``. Input dicts are not modified.
        filename: Path of the CSV file to append to.

    Raises:
        ValueError: If a row contains a key outside the known columns.
    """
    if not output_list:
        return
    # Column order of the output; 'url' matches the key the rows carry.
    fieldnames = ['adres', 'kenmerken', 'waarde', 'url']
    # Open once for all rows; newline='' lets the csv module control line
    # endings (otherwise blank lines appear between rows on Windows).
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        for row in output_list:
            record = dict(row)
            if 'link' in record:
                record.setdefault('url', record.pop('link'))
            writer.writerow(record)
def run_process(page_number, filename, browser):
    """Scrape one listing page and append its rows to *filename*.

    Args:
        page_number: Page index passed to ``connect_to_base``.
        filename: CSV path passed to ``write_to_file``.
        browser: WebDriver used to load the page and read its source.
    """
    connected = connect_to_base(browser, page_number)
    if not connected:
        print('Error connecting to jaap')
        return
    # Pause two seconds before reading the page source.
    sleep(2)
    page_source = browser.page_source
    rows = parse_html(page_source)
    write_to_file(rows, filename)
if __name__ == '__main__':
    # Script entry point: scrape listing pages 1-3 into a timestamped CSV
    # and report the total run time.
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'output_{output_timestamp}.csv'
    browser = get_driver()
    try:
        # scrape and crawl
        for current_page in range(1, 4):
            print(f'Scraping page #{current_page}...')
            run_process(current_page, output_filename, browser)
    finally:
        # Always shut the driver down, even if a page raised, so no Chrome
        # process is left behind.
        browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
# Posted on 2020-01-06 13:46:49
我看到你把 EC.presence_of_element_located((By.ID,{'class':'result-content'})) 改成了 EC.presence_of_element_located((By.CLASS_NAME,'result-content'))。
接下来,您可能会遇到另一个问题(取决于浏览器是在哪里打开的):您必须绕过或点击一个 JavaScript 弹窗,以确认您同意并接受 cookie。
但是,考虑到数据作为json格式存储在来自html的script标记中,所有这些代码似乎都是一项艰巨的工作。为什么不简单地使用requests,取出json,转换成dataframe,然后写到csv?
import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize
def run_process(page_number):
    """Fetch one listing page and return its properties as a DataFrame.

    The page embeds its data as JSON inside a ``<script id="page-data">``
    element; the ``properties`` entry of that JSON is flattened into columns.

    Args:
        page_number: Page index interpolated into the listing URL.

    Returns:
        A DataFrame built from the ``properties`` entry of the page JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``page-data`` script element.
    """
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    # Without a timeout a stalled server would block the script forever.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    page_data = soup.find('script', {'id': 'page-data'})
    if page_data is None:
        raise ValueError(f'No page-data script found at {base_url}')
    jsonData = json.loads(page_data.text)
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize.
    return pd.json_normalize(jsonData['properties'])
if __name__ == '__main__':
    # Script entry point: scrape listing pages 1-3, combine them into one
    # DataFrame, write it to a timestamped CSV and report the run time.
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'
    page_frames = []
    for current_page in range(1, 4):
        print(f'Scraping page #{current_page}...')
        page_frames.append(run_process(current_page))
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every page.
    final_df = pd.concat(page_frames, sort=True).reset_index(drop=True)
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
# Output:
Scraping page #1...
Scraping page #2...
Scraping page #3...
Elapsed run time: 7.441420555114746 seconds
生成的 csv 文件内容如下:
app area detailsUrl expired houseTypeValue id latLng latLng.latitude latLng.longitude location.city location.street location.zipcode lotSize market numberOfRooms openHouseDate openHouseTimes openhouse photo price priceToShow showoffColor showoffCustomText showoffPhotoText spotlight status veiling
0 False 165 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 6899666 NaN 52.368420 4.833631 AMSTERDAM Hof van Versailles 61 1064NX 216 sale 4 None None False 10014EAAF8B8883668593EFAC9E5FF1C 595000.0 595000.0 None None None False Sale False
1 True 211 /te-koop/noord+holland/groot-amsterdam/amsterd... False Appartement 10585731 NaN 52.327550 4.889076 AMSTERDAM Beysterveld 35 1083KA Onbekend sale 4 None None False E4F9E5BC7BC90B5B92C7BD8D48B7A677 925000.0 925000.0 None None None False Sale False
2 True 111 /te-koop/noord+holland/groot-amsterdam/amsterd... False Dubbele bovenwoning 11731386 NaN 52.341890 4.896053 AMSTERDAM Uiterwaardenstraat 320 2 1079DC Onbekend sale 5 None None False AB9F45B2CD4AD7879C5A80F18092F9D4 750000.0 750000.0 None None None False SoldConditionally False
3 False 269 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 11840681 NaN 52.358266 4.875508 AMSTERDAM Korte van Eeghenstraat 4 1071ER 107 sale 9 None None False A3DF2B1D426B5E4D501503C5D0E66966 3100000.0 3100000.0 None None None False Sale False
4 False 100 /te-koop/noord+holland/groot-amsterdam/amsterd... False Tussenwoning 12152943 NaN 52.421245 4.899478 AMSTERDAM Pieter A v Heijningestraat 9 1035SV 83 sale 5 None None False 55C6F589523FA553D67A709776DD70DD 399000.0 399000.0 None None None False Sale False
5 True 111 /te-koop/noord+holland/groot-amsterdam/amsterd... False Bovenwoning 15796874 NaN NaN NaN AMSTERDAM Eerste Amstelvlietpad 20 1096GB Onbekend sale 3 None None False AE822B627ED096310B9ECBE7756340C8 1200000.0 1200000.0 None None None False Sale False
6 True 76 /te-koop/noord+holland/groot-amsterdam/amsterd... False Benedenwoning 10580650 NaN 52.346010 4.888799 AMSTERDAM Grevelingenstraat 18 HS 1078KP Onbekend sale 2 None None False 6FD1011D917E776DCF4DA836B5FFEE3E 550000.0 550000.0 None None None False SoldConditionally False
7 False 298 /te-koop/noord+holland/groot-amsterdam/amsterd... False Villa 9623182 NaN 52.330610 4.862902 AMSTERDAM Cannenburg 51 1081GW 651 sale 7 None None False 15FA170B99D4E2DEA03B6FC27E3B5B74 2495000.0 2495000.0 None None None False Sale False
8 False 270 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 15791215 NaN 52.347780 5.004530 AMSTERDAM Nico Jessekade 189 1087MR 200 sale 9 None None False 6EA5C0CDA0475DFC88A3A918A6B2909A 1549000.0 1549000.0 None None None False SoldConditionally False
9 False 201 /te-koop/noord+holland/groot-amsterdam/amsterd... False Villa 9617942 NaN 52.377391 4.764554 AMSTERDAM Osdorperweg 803 1067SW 1348 sale 6 None None False 4680429D99EC5AC47C950D57A77DF1EB 950000.0 950000.0 None None None False Sale False
更新:
import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
def run_process(page_number):
    """Fetch one listing page and enrich each property with its detail tables.

    The listing JSON (``<script id="page-data">``) is flattened into a
    DataFrame. For every property the detail page is read with
    ``pd.read_html`` and each non-empty table found there is reshaped into a
    single row of extra columns, which is then joined onto the property.

    Args:
        page_number: Page index interpolated into the listing URL.

    Returns:
        The listing DataFrame left-joined (on index) with the detail columns.

    Raises:
        requests.HTTPError: If the listing page answers with an error status.
    """
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    # Without a timeout a stalled server would block the script forever.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    jsonStr = soup.find('script', {'id': 'page-data'}).text
    jsonData = json.loads(jsonStr)
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize.
    df = pd.json_normalize(jsonData['properties'])

    # Prefix the site root onto the detail URLs taken from the JSON.
    root_URL = 'https://jaap.nl'
    df['detailsUrl'] = root_URL + df['detailsUrl']

    detail_frames = []
    for idx, row in df.iterrows():
        propDetails = pd.DataFrame(index=[0])
        detailLink = row['detailsUrl']
        print('Scraping: %s' % (row['location.street']))
        dfs = pd.read_html(detailLink)

        for each in dfs:
            # Skip tables that contain no data at all.
            if each.isnull().all().all():
                continue
            each = each.dropna(axis=0, how='all')

            # Tables mentioning 'Voorziening' in any text column use a
            # different layout and are reshaped separately below.
            specialCase = any(
                each[col].dtypes == 'object'
                and each[col].str.contains('Voorziening').any()
                for col in each.columns
            )

            # Strip trailing dots/spaces from every text cell.
            df_obj = each.select_dtypes(['object'])
            each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))

            if specialCase:
                # Rows from the third onward: column 0 becomes the column
                # names, columns 1 and 2 are fused with '---' and split again
                # into '<name>' and '<name>.distance'.
                cols1 = list(each.iloc[2:, 0])
                each = each.iloc[2:, :].copy()
                each[1] = each[1] + '---' + each[2]
                each = each.iloc[:, -2]
                each.index = cols1
                propRow = each.to_frame().T
                propRow.index = [0]

                temp_df = pd.DataFrame(index=[0])
                for col in propRow.columns:
                    temp_df = temp_df.merge(
                        propRow[col].str.split('---', expand=True).rename(
                            columns={0: col, 1: col + '.distance'}),
                        left_index=True, right_index=True)
                propRow = temp_df
            else:
                # Transposed table: first row supplies the labels (prefixed
                # with the first index label), last row supplies the values.
                temp_df = each.T
                cols = [temp_df.index[0] + '_' + colName
                        for colName in list(temp_df.iloc[0, :])]
                propRow = temp_df.iloc[-1, :]
                propRow.index = cols
                propRow = propRow.to_frame().T
                propRow.index = [0]

            propDetails = propDetails.merge(propRow, left_index=True, right_index=True)

        # Re-key the single detail row to the property's index for the join.
        propDetails.index = [idx]
        detail_frames.append(propDetails)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if detail_frames:
        allPropDetails = pd.concat(detail_frames, sort=True)
    else:
        allPropDetails = pd.DataFrame()

    df = df.merge(allPropDetails, how='left', left_index=True, right_index=True)
    return df
if __name__ == '__main__':
    # Script entry point: scrape listing pages 1-3 (including detail pages),
    # combine them into one DataFrame, write it to a timestamped CSV and
    # report the run time.
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'
    page_frames = []
    for current_page in range(1, 4):
        print(f'Scraping page #{current_page}...')
        page_frames.append(run_process(current_page))
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every page.
    final_df = pd.concat(page_frames, sort=True).reset_index(drop=True)
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
# Source: https://stackoverflow.com/questions/59611946
复制相似问题