import requests
import json
import urllib
import lyricsgenius
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client.dbsparta
def get_artist_id(artistName):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
response = requests.get("https://api.musixmatch.com/ws/1.1/artist.search?page_size=100&format=json&apikey=123&q_artist=" + artistName, headers=headers)
response.encoding = 'UTF-8'
return response.json()['message']['body']['artist_list'][0]['artist']['artist_id']
# print(response.json()['message']['body']['artist_list'][0]['artist']['artist_id'])
def get_album_ids(artist_id):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
album_response = requests.get("https://api.musixmatch.com/ws/1.1/artist.albums.get?page_size=100&format=json&apikey=123&artist_id=" + str(artist_id), headers=headers)
album_response.encoding = 'UTF-8'
# counter = 0
# album_list = album_response.json()['message']['body']['album_list']
return album_response.json()['message']['body']['album_list']
# print(album_response.json()['message']['body']['album_list'])
# for album in album_list:
# # counter += 1
# print(album['album']['album_id'])
def get_album_tracks_ids(album_id):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
response = requests.get("https://api.musixmatch.com/ws/1.1/album.tracks.get?page_size=100&format=json&apikey=123&album_id=" + str(album_id), headers=headers)
response.encoding = 'UTF-8'
return response.json()['message']['body']['track_list']
# def get_track_id(artist_id):
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
# response = requests.get("https://api.musixmatch.com/ws/1.1/track.search?page_size=100format=json&apikey=123&f_artist_id=" + str(artist_id), headers=headers)
# response.encoding = 'UTF-8'
# for tracks in response.json()['message']['body']['track_list']:
# print(tracks['track']['track_name'])
def get_track_lyrics(track_id):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
response = requests.get("https://api.musixmatch.com/ws/1.1/track.lyrics.get?apikey=123&track_id=" + str(track_id), headers=headers)
response.encoding = 'UTF-8'
# return response['message']['body']['lyrics']['lyrics_body']
return response.json()['message']['body']['lyrics']['lyrics_body']
def main():
stars_list = list(db.new_top200.find({}, {'_id': 0}))
for stars in stars_list:
print(stars['name'])
album_ids = get_album_ids(get_artist_id(stars['name']))
# if album_ids is not None:
for album_id in album_ids:
# if album_id is not None and get_album_tracks_ids(album_id['album']['album_id']) is not [] and get_album_tracks_ids(album_id['album']['album_id']) is not None:
track_ids = get_album_tracks_ids(album_id['album']['album_id'])
for track in track_ids:
# if track is not [] and track['track']['track_id'] is not [] and track is not None:
# if get_track_lyrics(track['track']['track_id']) is not [] and get_track_lyrics(track['track']['track_id']) is not None:
lyric = get_track_lyrics(track['track']['track_id'])
db.new_top200.update_one({'name': stars['name']},{'$push': {'lyrics': lyric } })
# get_track_id(get_artist_id('Kanye West'))
# get_album_ids(get_artist_id("Kanye West"))
# get_album_tracks(15565713)
if __name__ == "__main__":
# for album in get_album_ids(get_artist_id("Kanye West")):
# get_album_tracks_ids(album['album']['album_id'])
# get_track_lyrics(96610952)
# get_album_tracks_ids(15565713)
# get_album_ids(get_artist_id('Drake'))
main()我正在尝试获取一个艺术家的所有歌词,并将其存储在数据库中。例如,如果艺术家是"Drake“,我希望将所有歌词存储在数据库中的”lyrics“关键字中。
但是,每次运行相同的代码时,我都会收到一堆不可预测的错误。例如,它插入400个歌词没有任何问题,突然我会得到一个错误,说‘列表索引必须是整数或切片不是字符串’。这个错误让我非常困惑,因为我假设所有的json数据都是相同的格式,并且在处理400个歌词之后突然出现错误,而enter image description here没有问题
我可以运行相同的代码,大约200个歌词,我会得到一个json解码错误,然后当我可以再次运行它时,在处理不同数量的歌词后,我会再次得到我在开头描述的错误。
有人能解释一下这个错误的随机性吗?
谢谢!
发布于 2020-06-18 23:10:53
您正在对将从JSON返回的数据类型进行假设。在您的例子中,我怀疑其中一个json元素是一个列表,而不是一个对象。
你的问题可以用这个简单的例子来重现:
my_dict = {
'message': {
'body': {
'lyrics': ['Always look on the bright side of life']
}
}
}
print(my_dict['message']['body']['lyrics']['lyrics_body'])提供:
TypeError: list indices must be integers or slices, not str你怎么解决它呢?您需要检查每个元素是否与您期望的元素匹配;例如:
my_dict = {
'message': {
'body': {
'lyrics': ['Always look on the bright side of life']
}
}
}
def checker(item, field):
if isinstance(item, dict):
return item.get(field)
else:
raise ValueError(f"'{item}' in field '{field}' is not a valid dict")
message = checker(my_dict, 'message')
body = checker(message, 'body')
lyrics = checker(body, 'lyrics')
print(checker(lyrics, 'lyrics'))提供:
ValueError: '['Always look on the bright side of life']' in field 'lyrics' is not a valid dicthttps://stackoverflow.com/questions/62442601
复制相似问题