我有一个python脚本,它可以端到端地运行,并为Kodi EPG生成看起来有效的XMLTV指南数据(尽管这是我第一次尝试创建这样的数据):
Tv_guide类:
def __init__(self, start, end, args, kwargs):
self.start = 1
self.end = 3000
global i, channel_list, url_list, date_check, image_list, super_list, epg_list
channel_list = []
url_list = []
date_check = []
image_list = []
super_list = []
epg_list = []
for i in range(66, 67):
global div_list
div_list = []
time.sleep(1)
def session_setup():
global r, html
SelectProxy.select_proxy()
local_proxy = SelectProxy.global_proxy
session = requests.Session()
session.proxies = {local_proxy}
url = ['http://www.tvguide.co.uk/mobile/channellisting.asp?ch=', str(i)]
url = ''.join(url)
if url not in url_list:
url_list.append(url)
headers ={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.',
'Connection': 'keep-alive',
'Host': 'www.tvguide.co.uk',
'Referer': 'http://www.tvguide.co.uk/mobile/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
r = session.get(url, headers=headers)
html = r.content
session_setup()
def soup_scanner():
global soup
global channel, image4, div_list2
time.sleep(1)
soup = BeautifulSoup(r.text, "html.parser")
try:
channel = soup.find('h1').text
for div in soup.select('div'):
if not div.script:
for image in div.find_all('img'):
image2 = re.search(r' src="\S+?"', str(image))
image2 = image2.group(0)
image3 = image2.split('src="')[1]
image4a = image3.split('"')[0]
div_list.append(image4a)
div_list2 = ''
div_list2 = div_list[3]
except Exception as exc:
pass
soup_scanner()
def soup_parser():
global joined_list, joined_list2, date, time_list, desc_list, rating_list, date_list
rolling_date = soup.find_all('h2')
joined_list2 = []
for x in soup.select('table'):
time_list = []
desc_list = []
rating_list = []
date_list = []
joined_list = []
xdate = rolling_date[0].text
rolling_date.pop(0)
for tr in x.select('tr'):
if not tr.script:
for td in tr.find_all('td'):
date_list.append(xdate)
a = ''.join(re.sub(r'\s+', ' ', td.text))
b = a.strip()
if b[:1] in '0123456789':
time_list.append(b)
else:
if ' Rating' in b:
c = b.split(' Rating')
else:
c = b.split(' Rating')
c.append(0.0)
desc = c[0]
desc_list.append(desc)
rating = ''.join(['Rating: ', str(c[1])])
rating_list.append(rating)
try:
time_list_lag = time_list[:]
del time_list_lag[0]
joined_list = zip(time_list, time_list_lag, desc_list, rating_list, date_list)
joined_list2.append(joined_list)
except Exception as exc:
print traceback.format_exc()
pass
soup_parser()
def soup_to_text():
global cat_list, super_list2
for sub in joined_list2:
for e in sub:
super_list2 = []
now = datetime.now()
if channel not in channel_list:
channel_list.append(channel)
if date not in date_check:
date_check.append(date)
if div_list2 not in image_list:
image_list.append(div_list2)
super_list2.append(channel)
super_list2.append(div_list2)
super_list2.append('http://www.tv.sky.com')
super_list2.append(date)
try:
starttime = datetime.strptime(' '.join([str(now.year), e[4], e[0]]), '%Y %a %d %b %H:%M%p').strftime('%Y%m%d%H%M%S')
endtime = datetime.strptime(' '.join([str(now.year), e[4], e[1]]), '%Y %a %d %b %H:%M%p').strftime('%Y%m%d%H%M%S')
global epg_data
clean_channel = str(channel).replace('&', '&')
clean_e2 = str(e[2]).replace('&', '&')
clean_e3 = str(e[3]).replace('&', '&')
epg_data = ''.join(['<program start="',starttime,' +0200" stop="',endtime,' +0200" channel="',clean_channel,'">','<desc lang="eng">',clean_e2,' ',clean_e3,'</desc>','<icon src="',div_list2,'" />', \
'<country>UK</country>','</program>'])
epg_list.append(epg_data)
except Exception as exc:
pass
#print e
#print traceback.format_exc()
super_list.append(super_list2)
cat_list = zip(channel_list, date_check, url_list, image_list)
soup_to_text()
print '**********************************************************************************************************************************************************************'
print '<?xml version="1.0" encoding="UTF-8"?>'
print'<tv generator-info-name="TV Guide Scraper - by Aaron Aardvark" generator-info-url="http://www.aaardvark.com">'
for superx in super_list:
print ''.join(['<channel id="',str(superx[0]),'">'])
print ''.join(['<display-name lang="en">',str(superx[0]),'</display-name>'])
print ''.join(['<icon src="',str(superx[1]),'" />'])
print ''.join(['<url>',str(superx[2]),'</url>'])
print ''.join(['</channel>'])
for epg in epg_list:
print epg
print '</tv>'为了测试它,我现在只抓取了一个通道。我在NextPVR后端和一个在线解析工具中都进行了尝试,并获得了相同的结果。我的输出正确地生成了一个频道列表,但没有节目数据。
我已经将我精简后的输出包括在下面。有没有人看到我做错了什么?
谢谢
<?xml version="1.0" encoding="UTF-8"?>
<tv generator-info-name="TV Guide Scraper - by Aaron Aardvark" generator-info-url="http://www.aaardvark.com">
<channel id="BBC News">
<display-name lang="en">BBC News</display-name>
<icon src="http://my.tvguide.co.uk/channel_logos/60x35/66.png" />
<url>http://www.tv.sky.com</url>
</channel>
<program start="20180506060000 +0200" stop="20180506070000 +0200" channel="BBC News"><desc lang="eng">Breakfast A round-up of national and international news, plus sports reports, weather forecasts and arts and entertainment features (Subtitles) Rating: : 1.5</desc><icon src="http://my.tvguide.co.uk/channel_logos/60x35/66.png" /><country>UK</country></program>
</tv>发布于 2018-05-07 21:09:21
如果有人遇到同样的问题,最终解决了这个问题。标记应为<programme>,而不是<program>。
https://stackoverflow.com/questions/50201769
复制相似问题