目前,我想请大家指出哪些地方可以修改,以提高代码的简单性和模块化程度;也想知道我是否遵循了良好的命名约定,从而写出干净、可读的代码。任何批评都非常感谢。
import time
import re
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import time
# Single shared HTTP session so TCP connections are reused across all requests.
SESSION = requests.Session()
""" This is the Google Analytics Selector Section """
class myGoogleSession:
    """Namespace for fetching the Google Trends daily RSS feed.

    Holds no per-instance state; the shared module-level SESSION does the I/O.
    """

    def fetch_google_xml(self, URL, country_code):
        """Download the trends RSS for ``country_code`` and parse it.

        Parameters:
            URL: base RSS endpoint ending in ``geo=``.
            country_code: two-letter geo code appended to the URL.

        Returns:
            BeautifulSoup restricted to the feed's ``<channel>`` element.
        """
        format_url = f"{URL}{country_code}"
        response = SESSION.get(format_url)
        # Fix: feed raw bytes (response.content), not requests' decoded text,
        # to the XML parser so lxml can honour the document's own encoding
        # declaration and avoid decoding errors.
        soup = BeautifulSoup(response.content, 'xml',
                             parse_only=SoupStrainer('channel'))
        return soup


google_session = myGoogleSession()
def google_trends_retriever(URL, country_code):
    """Return (title, cleaned-traffic) pairs scraped from one country's feed.

    Traffic strings have '+' and thousands commas removed so they can be
    treated as plain numbers downstream.
    """
    soup = google_session.fetch_google_xml(URL, country_code)
    print(country_code)
    titles = soup.find_all('title')[1:]  # first <title> is the feed header
    traffics = soup.find_all('ht:approx_traffic')
    pairs = []
    for title, traffic in zip(titles, traffics):
        score = re.sub("[+,]", "", traffic.text)
        pairs.append((title.text, score))
    return pairs
def create_pdTrend(data):
    """Build a Title/Score DataFrame for country code ``data``.

    Returns the DataFrame, or None (after printing a notice) when the feed
    yielded no rows.  Relies on the module-level ``GoogleURL``.
    """
    rows = google_trends_retriever(GoogleURL, data)
    frame = pd.DataFrame(rows, columns=['Title', 'Score'])
    if frame.empty:
        print('No available data')
        return None
    return frame
""" This is the Country Code Selector Section """
# Global accumulator for two-letter country codes; mutated in place by parse_row().
country_code_list = []
class myCountryCodeSession:
    """Namespace for downloading the countrycode.org table markup."""

    def fetch_countrycode_html(self, URL):
        """GET ``URL`` and return a soup restricted to <table> elements."""
        html_text = SESSION.get(URL).text
        return BeautifulSoup(html_text, 'html.parser',
                             parse_only=SoupStrainer('table'))


countryCode_session = myCountryCodeSession()
def parse_row(url):
    """Scrape countrycode.org and append each row's 2-letter code to the global list.

    Side effect only: mutates country_code_list and returns None.
    """
    table_soup = countryCode_session.fetch_countrycode_html(url)
    for row in table_soup.findChildren(['td', 'tr']):
        # Slice [2:3] keeps only the third cell (the code column) and quietly
        # skips rows that are too short to have one.
        for cell in row.findChildren('td')[2:3]:
            country_code_list.append(cell.string[:2])
    return None
def create_pdCountryCode(country_code):
    """Wrap an iterable of codes in a one-column 'Country_Code' DataFrame."""
    frame = pd.DataFrame(data={'Country_Code': country_code})
    return frame
def iterate_List(data):
    """Print a trends DataFrame for the first 239 stored country codes.

    NOTE: ``data`` is unused; the codes come from get_data_fromList's global.
    """
    for index in range(1, 240):
        print(create_pdTrend(get_data_fromList(index)))
    # Matches the original while/else completion message (counter ends at 240).
    print('Has reach the end of i ' + str(240))
def get_data_fromList(num):
    """Return the country code at 1-based position ``num`` as a string.

    Simplified from a slice-then-loop construction, preserving its behavior:
    returns None (instead of raising IndexError) when the position is out of
    range, e.g. ``num == 0`` or past the end of the global list.
    """
    selected = country_code_list[num - 1:num]
    return str(selected[0]) if selected else None
if __name__ == '__main__':
    # Script entry point: scrape country codes, then print trends per country.
    """ URL Section """
    GoogleURL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo="
    CountryCodeURL = "https://countrycode.org/"
    """-------------"""
    start = time.time()  # wall-clock timer for the whole run
    print("hello")
    """Country Code Section """
    parse_row(CountryCodeURL)  # fills the global country_code_list in place
    """---------------------"""
    """Google Analytics Section """
    iterate_List(country_code_list)  # argument is ignored by iterate_List
    """-------------------------"""
    end = time.time()
    print(end - start)

发布于 2019-01-17 10:26:13
PEP 8 是正式的 Python 风格指南。如果您对良好的命名约定和其他良好实践感兴趣,可以从它开始。
除其他外,您的代码将受益于:
- 函数和变量使用 lower_snake_case;
- 类名使用 PascalCase;
- 使用以 # 开头的注释,而不是代码中的原始字符串。

第一次重写将产生:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
# Single shared HTTP session, reused for every request in this version.
SESSION = requests.Session()
# This is the Google Analytics Selector Section
class GoogleSession:
    """Stateless namespace for fetching the Google Trends RSS feed."""

    def fetch_google_xml(self, URL, country_code):
        """GET URL+country_code and return soup limited to the <channel> tag.

        NOTE(review): response.text is requests' decoded text; for XML the
        raw response.content would let the parser honour the document's own
        encoding declaration (addressed in the final version below).
        """
        response = SESSION.get(f"{URL}{country_code}")
        return BeautifulSoup(
            response.text, 'xml',
            parse_only=SoupStrainer('channel'))


google_session = GoogleSession()
def google_trends_retriever(URL, country_code):
    """Return a list of (title, cleaned-traffic) tuples for one country.

    Traffic strings have '+' and ',' removed so they read as plain numbers.
    """
    xml_soup = google_session.fetch_google_xml(URL, country_code)
    print(country_code)  # progress indicator
    titles = xml_soup.find_all('title')[1:]  # skip the feed's own title
    traffics = xml_soup.find_all('ht:approx_traffic')
    return [
        (title.text, re.sub("[+,]", "", traffic.text))
        for title, traffic in zip(titles, traffics)
    ]
def create_pd_trend(data):
    """Build a Title/Score DataFrame for country code ``data``.

    Returns None (after printing a notice) when no rows were scraped,
    otherwise the DataFrame.  Depends on the module-level ``google_URL``.
    """
    check_panda = pd.DataFrame(
        google_trends_retriever(google_URL, data),
        columns=['Title', 'Score'],
    )
    if len(check_panda) == 0:
        print('No available data')
    else:
        return check_panda
# This is the Country Code Selector Section
# Mutable module-level accumulator, filled in place by parse_row().
country_code_list = []
class CountryCodeSession:
    """Stateless namespace for downloading the countrycode.org tables."""

    def fetch_country_code_html(self, URL):
        """GET ``URL`` and parse only the <table> elements of the HTML."""
        response = SESSION.get(URL)
        return BeautifulSoup(
            response.text, 'html.parser',
            parse_only=SoupStrainer('table'))


country_code_session = CountryCodeSession()
def parse_row(url):
    """Append each table row's two-letter country code to country_code_list.

    Side effect only; mutates the global list and returns None.
    """
    rows = country_code_session.fetch_country_code_html(url)
    for row in rows.find_all(['td', 'tr']):
        # [2:3] keeps only the third cell (the code column); shorter rows
        # simply produce an empty slice and are skipped.
        cells = row.find_all('td')[2:3]
        for cell in cells:
            value = cell.string
            country_code_list.append(value[:2])
def iterate_list(data):
    """Print the trends DataFrame for the first 239 stored country codes.

    NOTE(review): ``data`` is unused; codes come from the global list via
    get_data_from_list.
    """
    i = 1
    while i <= 239:
        selected_country_code = get_data_from_list(i)
        print(create_pd_trend(selected_country_code))
        i += 1
    else:
        # while/else: runs once the condition turns false (there is no break).
        print('Has reach the end of i', i)
def get_data_from_list(num):
    """Return the country code at 1-based position ``num``, or None if absent."""
    window = country_code_list[num - 1:num]
    if window:
        return str(window[0])
    return None
if __name__ == '__main__':
    # Script entry point: scrape country codes, then print trends per country.
    # URL Section
    google_URL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo="
    country_code_URL = "https://countrycode.org/"
    # -------------
    start = time.time()  # wall-clock timer for the whole run
    print("hello")
    # Country Code Section
    parse_row(country_code_URL)  # fills the global country_code_list in place
    # ---------------------
    # Google Analytics Section
    iterate_list(country_code_list)  # argument is ignored by iterate_list
    # -------------------------
    end = time.time()
    print(end - start)

当我看到
    def get_data_fromList(num):
        key = num - 1
        for i in country_code_list[key:num]:
            return str(i)
我想知道为什么你会写这么复杂的代码。提取一个元素的子列表来迭代它,并返回第一个…你可以把它简化为
def get_data_from_list(num):
    return str(country_code_list[num - 1])

但是我想知道为什么需要这个函数——我看到它是通过迭代索引来调用的。不要这样做。按 for 循环的本意来使用它:直接迭代内容。
这将产生:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
# Single shared HTTP session, reused for every request in this version.
SESSION = requests.Session()
# This is the Google Analytics Selector Section
class GoogleSession:
    """Stateless namespace for fetching the Google Trends RSS feed."""

    def fetch_google_xml(self, URL, country_code):
        """GET URL+country_code and return soup limited to the <channel> tag.

        NOTE(review): response.text is requests' decoded text; the final
        version below switches to response.content for correct XML decoding.
        """
        response = SESSION.get(f"{URL}{country_code}")
        return BeautifulSoup(
            response.text, 'xml',
            parse_only=SoupStrainer('channel'))


google_session = GoogleSession()
def google_trends_retriever(URL, country_code):
    """Return a list of (title, cleaned-traffic) tuples for one country.

    Traffic strings have '+' and ',' removed so they read as plain numbers.
    """
    xml_soup = google_session.fetch_google_xml(URL, country_code)
    print(country_code)  # progress indicator
    titles = xml_soup.find_all('title')[1:]  # skip the feed's own title
    traffics = xml_soup.find_all('ht:approx_traffic')
    return [
        (title.text, re.sub("[+,]", "", traffic.text))
        for title, traffic in zip(titles, traffics)
    ]
def create_pd_trend(data):
    """Build a Title/Score DataFrame for country code ``data``.

    Returns None (after printing a notice) when no rows were scraped,
    otherwise the DataFrame.  Depends on the module-level ``google_URL``.
    """
    check_panda = pd.DataFrame(
        google_trends_retriever(google_URL, data),
        columns=['Title', 'Score'],
    )
    if len(check_panda) == 0:
        print('No available data')
    else:
        return check_panda
# This is the Country Code Selector Section
class CountryCodeSession:
    """Stateless namespace for downloading the countrycode.org tables."""

    def fetch_country_code_html(self, URL):
        """GET ``URL`` and parse only the <table> elements of the HTML."""
        response = SESSION.get(URL)
        return BeautifulSoup(
            response.text, 'html.parser',
            parse_only=SoupStrainer('table'))


country_code_session = CountryCodeSession()
def parse_row(url):
    """Scrape countrycode.org and return a list of two-letter country codes.

    This version returns the list instead of mutating a global.
    """
    rows = country_code_session.fetch_country_code_html(url)
    return [
        cell.string[:2]
        for row in rows.find_all(['td', 'tr'])
        # [2:3] keeps only the third cell and skips rows that are too short
        for cell in row.find_all('td')[2:3]
    ]
def iterate_list(country_codes):
    """Print a trends DataFrame per country code, then a completion notice."""
    for country_code in country_codes:
        print(create_pd_trend(str(country_code)))
    else:
        # for/else: always runs here because the loop body has no break.
        print('Has reach the end of i', len(country_codes))
if __name__ == '__main__':
    # Script entry point: scrape country codes, then print trends per country.
    # URL Section
    google_URL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo="
    country_code_URL = "https://countrycode.org/"
    # -------------
    start = time.time()  # wall-clock timer for the whole run
    print("hello")
    # Country Code Section
    country_code_list = parse_row(country_code_URL)  # codes now returned, not global
    # ---------------------
    # Google Analytics Section
    iterate_list(country_code_list)
    # -------------------------
    end = time.time()
    print(end - start)

您的类完全没有为这些单独的函数增加任何价值:既不存储在多次调用之间复用的状态,也无法在多个函数之间共享状态。它们只是套在命名空间里的普通函数——让它们做回简单的函数吧。
这段代码可以从使用类中获益,但不是这样的。
lxml,它是在指示BeautifulSoup解码'xml'时使用的底层解析器,它显式地处理原始字节,而不是解码文本。这是为了能够检测到显式编码声明,并对文档的其余部分进行适当的解码;这样您就不会有解码错误。
这意味着在解析XML时,需要将response.content而不是response.text提供给BeautifulSoup。
您的代码在很大程度上依赖于全局变量和printing数据。这是代码中最糟糕的部分,因为它使代码几乎无法重用,并且很难正确测试(比如unittest或doctest)。
不要使用全局变量,而是将它们作为参数传递,并从函数中返回它们。
而不是打印结果,而是从函数返回值。这使得提取和按摩数据变得更容易。
还有在整个代码中使用的全局SESSION。我会将它封装到一个类中,以便每个实例都有一个会话,这样,如果需要的话,您可以轻松地爬行几个地址。
我对这个问题的看法是:
import re
from functools import partial
import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
class GoogleAnalysis:
    """Scrapes Google Trends daily RSS data through one reusable HTTP session."""

    def __init__(self, url):
        # Bind the session and base URL together once; calling
        # self.get_url(params=...) issues a GET against that URL.
        session = requests.Session()
        self.get_url = partial(session.get, url)

    def _fetch_xml(self, country_code):
        """GET the feed for ``country_code``; parse only the <channel> element.

        Passing response.content (raw bytes) lets the XML parser apply the
        document's own encoding declaration rather than requests' guess.
        """
        response = self.get_url(params={'geo': country_code})
        return BeautifulSoup(
            response.content, 'xml',
            parse_only=SoupStrainer('channel'))

    def _retrieve_trends(self, country_code):
        """Return (title, traffic) tuples, with '+' and ',' stripped from traffic."""
        soup = self._fetch_xml(country_code)
        titles = soup.find_all('title')[1:]  # first <title> is the feed header
        traffics = soup.find_all('ht:approx_traffic')
        return [
            (title.text, re.sub("[+,]", "", traffic.text))
            for title, traffic in zip(titles, traffics)
        ]

    def trends(self, country_code):
        """Return a Title/Score DataFrame tagged with its country code."""
        df = pd.DataFrame(
            self._retrieve_trends(country_code),
            columns=['Title', 'Score'],
        )
        df['Country Code'] = country_code
        return df
def country_codes(url='https://countrycode.org/'):
    """Scrape countrycode.org and return the two-letter code of each table row."""
    response = requests.get(url)
    soup = BeautifulSoup(
        response.text, 'lxml',
        parse_only=SoupStrainer('table'))
    return [
        cell.string[:2]
        for row in soup.find_all(['td', 'tr'])
        # Some rows don't define row.find_all('td')[2] so filter out
        for cell in row.find_all('td')[2:3]
    ]
def main(url):
    """Fetch trends for every distinct country code and concatenate the frames."""
    google = GoogleAnalysis(url)
    codes = country_codes()
    return pd.concat([
        google.trends(country_code)
        # Country codes are repeated twice, we only need them once
        for country_code in codes[:len(codes) // 2]
    ])
if __name__ == '__main__':
    import time
    # perf_counter: monotonic, high-resolution clock suited to timing runs.
    start = time.perf_counter()
    print('Hello!')
    trends = main('https://trends.google.com/trends/trendingsearches/daily/rss')
    # Format only at the very end; all computation above is print-free.
    print(trends.to_string(index=False))
    print(time.perf_counter() - start)

注意,最后的 print(trends.to_string(index=False)) 可以换成您喜欢的任何输出方式:导出为 CSV,或使用 trends.groupby 重做旧格式。这里的核心思想是:计算过程中不包含任何 print;数据在最后才按您的喜好进行格式化。
https://codereview.stackexchange.com/questions/211654
复制相似问题