我编写了一个脚本来查找KMZ文件中的坐标。我试图使用ProcessPoolExecutor来加快执行速度。我对Python非常陌生,所以任何建议都非常受欢迎。
#!/usr/bin/env python
from zipfile import ZipFile
from lxml import html
import os
import concurrent.futures
def process_file(filename):
results=[]
try:
saved_file=False;
kmz = ZipFile(filename, 'r')
for kml_name in kmz.namelist():
if 'doc.kml' in kml_name:
continue
kml = kmz.open(kml_name, 'r').read()
doc = html.fromstring(kml)
for pm in doc.cssselect('Document Placemark'):
tmp = pm.cssselect('track')
if len(tmp):
# Track Placemark
tmp = tmp[0] # always one element by definition
for desc in tmp.iterdescendants():
content = desc.text_content()
if desc.tag == 'coord':
lon = float(content.split()[0])
lat = float(content.split()[1])
search_lon = -47
search_lat = 47
if (abs(lat - search_lat) <= 1 and abs(lon - search_lon) <= 1):
if not saved_file:
results.append('\nFile: ' + filename + '\n')
saved_file=True
results.append(content + '\n')
except:
pass
return results
def main():
# Search all files
kmz_files = []
for root, subdirs, files in os.walk('raw/L1B_Catalogue'):
for file in files:
if '.kmz' in file:
filename = os.path.join(root,file)
kmz_files.append(filename)
# Parallel execution
f=open('kmz_search_output.txt', 'wt')
count = 0
with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
for results in executor.map(process_file, kmz_files, chunksize=10):
print("{} / {}".format(count,len(kmz_files)))
count = count + 1
for line in results:
f.write(line);
pass
f.close()
print('Done')
if __name__ == '__main__':
main()发布于 2018-09-13 09:28:41
process_file是做什么的?它还能返回什么?process_file这个名字很模糊。最好使用特定的名称,例如kmz_coordinates。saved_file逻辑应该是不必要的,因为调用者已经知道他们传递给process_file的是哪个filename。因此,您可以为文件名编写以下内容:executor.map=kmz_coordinates(kmz_coordinates,kmz_files,chunksize=10),压缩中的同弦(kmz_files,结果):print("{} / {}".format( count,len(Kmz_files) count =count+1 f.write(f"File { filename }:\n")用于同弦:f.write(行);使用zip将文件名与结果匹配,并从kmz_coordinates中删除saved_file逻辑。count逻辑也没有必要,因为您可以使用enumerate来生成计数:C20= executor.map(kmz_coordinates,kmz_files,chunksize=10)用于计数,(文件名,和弦)在枚举中(zip(kmz_files,结果)):print("{} / {}".format(count,len(Kmz_files) f.write(f"File {filename}:\n") f.writelines(共弦)try: ... except: pass来抑制异常是个坏主意。这有两个原因。首先,一个普通的except:捕获所有异常,包括KeyboardInterrupt,这可能会使通过键入control来停止程序变得困难。其次,异常可能是由于程序中的but造成的,但是通过抑制它们,您很难发现but。如果您确实有一个很好的理由来抑制异常,那么您应该在可能引发异常的代码行周围尽可能紧密地本地化try: ... except,并且应该选择适当的异常类。例如,如果您担心的问题是lxml.html.fromstring会引发lxml.etree.ParserError,那么编写如下内容: try: doc = html.fromstring(kml),除了lxml.etree.ParserError: lxml.etree.ParserError:继续#跳过这个文件with是个好主意。例如: f=open('kmz_search_output.txt','wt') #.代码使用f ..。f.close()写: with open('kmz_search_output.txt','w')作为f:#.代码使用f ..。这节省了一行代码(不需要显式关闭文件),并确保文件及时关闭,即使在... code using f ...内部出现异常。lxml.etree而不是lxml.html吗?try: ... except:。tmp这个名字很模糊。最好尽可能具体,例如track。coord元素。因此,您可以将其组合成相同的循环:用于doc.cssselect中的coord (‘Document coord'):coord元素的内容: lon = float(content.split()) lat = float(content.split())只分割它一次,然后使用map和元组赋值: lon,lat =map(content.split,content.split())kml_track_coordinates的关键字参数,那么您就可以使用该函数搜索您选择的坐标。这是未经测试的,因此可能包含一些错误。
def kmz_coordinates(filename, search_lon=-47, search_lat=47, tolerance=1):
"""Return list of track coordinates found in a KMZ file that are
within tolerance degrees of (search_lon, search_lat).
"""
coords = []
with ZipFile(filename, 'r') as kmz:
for kml_name in kmz.namelist():
if not kml_name.endswith('.kml') or 'doc.kml' in kml_name:
continue
with kmz.open(kml_name, 'r') as kml:
doc = html.fromstring(kml.read())
for coord in doc.cssselect('Document Placemark track coord'):
content = coord.text_content()
lon, lat = map(float, content.split())
if (abs(lat - search_lat) <= tolerance
and abs(lon - search_lon)) <= tolerance):
coords.append(content + '\n')
return coords
def kmz_search_directory(directory, output_filename='kmz_search_output.txt'):
"""Search directory for KMZ files, find matching coordinates within
the files, and write the results to output_filename.
"""
kmz_files = []
for root, _, files in os.walk(directory):
for filename in files:
if filename.endswith('.kmz'):
kmz_files.append(os.path.join(root, filename))
with open(output_filename, 'w') as f:
with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
results = executor.map(kmz_coordinates, kmz_files, chunksize=10)
for count, (filename, coords) in enumerate(zip(kmz_files, results)):
print("{} / {}".format(count, len(kmz_files)))
f.writelines(coords)
print('Done')https://codereview.stackexchange.com/questions/203623
复制相似问题