我的代码可以正常运行,但只能处理一个 URL(例如,我使用的是 http://www.ancient-hebrew.org/m/dictionary/1000.html)。
不过,我想让代码处理每一个单独的 HTML 文件的 URL,这些文件可以在这里找到:https://www.ancient-hebrew.org/m/dictionary/ 。
from bs4 import BeautifulSoup
import re
import urllib
def getImage(_list):
    """Return absolute image URLs for the given image elements.

    Args:
        _list: iterable of items that support ``item['src']`` (for example
            the ``<img>`` tags returned by ``find_all``).

    Returns:
        A list with one URL string per item, in input order. Every literal
        ``../../`` in a ``src`` value is replaced by the site root.
    """
    # The dots are escaped so that only a literal "../../" is rewritten;
    # an unescaped "." would also match arbitrary path characters such as
    # the "ab/cd/" in "ab/cd/x.jpg".
    return [
        re.sub(r"\.\./\.\./", "http://www.ancient-hebrew.org/", image['src'])
        for image in _list
    ]
def getAudioFile(_list):
    """Return the absolute audio URL(s) for the given link elements.

    Args:
        _list: iterable of items that support ``item['href']`` (for example
            the ``<a>`` tags returned by ``find_all``).

    Returns:
        A single string: each ``href`` has its tab characters removed and is
        prefixed with the dictionary base URL. If several items are given,
        their URLs are concatenated with no separator; an empty input gives
        an empty string.
    """
    return ''.join(
        "http://www.ancient-hebrew.org/m/dictionary/"
        + link['href'].replace("\t", '')
        for link in _list
    )
def getHebrewWord(_list):
    """Return the stripped text of the given elements joined into one string.

    Args:
        _list: iterable of items exposing a ``.string`` attribute (for
            example the ``<font>`` tags returned by ``find_all``).

    Returns:
        The whitespace-stripped ``.string`` of every item, concatenated in
        input order with no separator. Items whose ``.string`` is ``None``
        are skipped.
    """
    # BeautifulSoup reports ``.string`` as None when a tag does not have
    # exactly one string child; skip those instead of failing on .strip().
    return ''.join(
        font.string.strip() for font in _list if font.string is not None
    )
# Fetch one dictionary page and collect, line by line, the image URLs, audio
# URL, Hebrew word and pronunciation found on each line of its HTML.
url = 'http://www.ancient-hebrew.org/m/dictionary/1000.html'
# Name the output file after the page number in the URL ("1000.txt" here).
# The dot is escaped so it matches only the extension separator.
file_name = re.search(r'(\d+)\.\w+$', url).group(1) + ".txt"
# NOTE(review): urllib.urlopen is the Python 2 spelling; on Python 3 this
# would have to be urllib.request.urlopen -- confirm the target interpreter.
raw_html = urllib.urlopen(url).readlines()
_list = []
_dict = {}  # not referenced by the code visible here
# What a line with no matching elements parses to; such lines are dropped.
_ignore = {'audioURLs': '', 'pronuncation': [],
           'imageURLs': [], 'hebrewWord': ''}
# Compiled once, outside the loop, with the dot escaped so that only real
# ".jpg" / ".mp3" extensions match.
_jpg_src = re.compile(r'\.jpg$')
_mp3_href = re.compile(r'\.mp3$')
for line in raw_html:
    # Each line of the page is parsed on its own, so every entry appended to
    # _list describes the elements found on a single source line.
    html = BeautifulSoup(line, 'lxml')
    # Image Files URLs
    images = getImage(html.find_all('img', src=_jpg_src))
    # Audio File URLs
    audioFile = getAudioFile(html.find_all('a', href=_mp3_href))
    # Hebrew Words
    hebrewWords = getHebrewWord(
        html.find_all('font', face="arial", size="+1"))
    # Pronunciations: the text node that directly follows each <font> placed
    # right after an <img>.
    pronunciation = [item.next_sibling.strip()
                     for item in html.select('img + font')]
    # Output: {'audioURLs': '', 'pronuncation': [], 'imageURLs': [], 'hebrewWord': ''}
    dictionary = {
        'audioURLs': audioFile,
        'pronuncation': pronunciation,
        'imageURLs': images,
        'hebrewWord': hebrewWords
    }
    if dictionary != _ignore:
        _list.append(dictionary)
with open(file_name, 'w') as f:
    for item in _list:
        f.write("%s\n" % item)
所以最后,我想把它们写入多个文件(有多少就写多少)。有什么简单的方法可以做到这一点吗?
发布于 2019-06-12 05:03:06
在我看来,你把事情弄得有些过于复杂了(而且——太可怕了!——还对 HTML 使用了正则表达式 D:)。我试着简化了其中一部分——获取图片和音频的链接,并把它们放进列表里。请注意,出于各种原因,我改了你用的一些变量名——不过把这些内容放回你的结构中,并加以扩展以获取单词本身,应该是比较容易的:
from bs4 import BeautifulSoup as bs
import requests
# Download one dictionary page and collect the absolute URLs of its .jpg
# images and .mp3 audio links into two lists.
url = 'http://www.ancient-hebrew.org/m/dictionary/1000.html'
raw_html = requests.get(url)
soup = bs(raw_html.content, 'lxml')
image_list = []
audio_list = []
# src=True / href=True keep only tags that carry the attribute, so the
# subscript lookups below cannot fail on a tag that lacks it.
images = soup.find_all('img', src=True)
audios = soup.find_all('a', href=True)
for image in images:
    if 'jpg' in image['src']:
        # Image paths are given relative to the site root via "../../".
        image_link = ("http://www.ancient-hebrew.org/"
                      + image['src'].replace('../../', ''))
        image_list.append(image_link)
for audio in audios:
    if 'mp3' in audio['href']:
        # Tab characters inside the href are removed before the base URL
        # is prepended.
        audio_link = ("http://www.ancient-hebrew.org/m/dictionary/"
                      + audio['href'].replace("\t", ''))
        audio_list.append(audio_link)
# ...etc.
https://stackoverflow.com/questions/56550723
复制相似问题