import pandas as pd
from pandas import ExcelWriter
import os
d1 = "D:\master concordia\DYD\Ecobee DYD\canada\y\Canada 2015"
d2 = "D:\master concordia\DYD\Ecobee DYD\canada\y\Canada 2016"
d3 = "D:\master concordia\DYD\Ecobee DYD\canada\y\Canada 2017"
d4 = "D:\master concordia\DYD\Ecobee DYD\canada\y\Canada 2018"
d5 = "D:\master concordia\DYD\Ecobee DYD\canada\y\Canada 2019"
output_dir = "D:\master concordia\DYD\Ecobee DYD\canada\y\new"
files_2015= os.listdir(d1)
files_2016= os.listdir(d2)
files_2017= os.listdir(d3)
files_2018= os.listdir(d4)
files_2019= os.listdir(d5)我有5个文件夹,其中包含800个excel文件,每个文件夹中的excel文件都有相同的名称,我想创建一个循环,这样我就可以将5个同名的excel文件组合成一个文件。有人能帮帮我吗?
发布于 2019-11-15 07:10:09
我没有安装pandas,所以虽然这不能回答您的整个问题,但它确实展示了如何迭代地从每个目录中获取具有相同名称的相应文件组,打开它们,然后对它们进行处理。
注意:出于测试目的,我缩短了根目录路径。
from contextlib import contextmanager
from glob import glob
#import pandas as pd
#from pandas import ExcelWriter
import os
@contextmanager
def multi_file_manager(filenames, mode='r'):
""" Open multiple files and make sure they all get closed. """
files = [open(filename, mode) for filename in filenames]
yield files
for file in files:
file.close()
root_dir = "Ecobee DYD"
folder_names = ("Canada 2015", "Canada 2016", "Canada 2017", "Canada 2018", "Canada 2019")
folder_paths = (os.path.join(root_dir, subfolder, "*.xlsx") for subfolder in folder_names)
folder_iters = [iter(sorted(glob(folder_path))) for folder_path in folder_paths]
while True:
try:
# Get next group of filenames.
xl_filenames = [next(folder_iter) for folder_iter in folder_iters]
except StopIteration:
break
with multi_file_manager(xl_filenames) as inp_files: # Open all in group.
print('processing files:', ', '.join(['"{}"'.format(f.name) for f in inp_files]))
# Combine group of open files in inp_files using pandas...
print('done')示例输出:
processing files: "Ecobee DYD\Canada 2015\file1.xlsx", "Ecobee DYD\Canada 2016\file1.xlsx", "Ecobee DYD\Canada 2017\file1.xlsx", "Ecobee DYD\Canada 2018\file1.xlsx", "Ecobee DYD\Canada 2019\file1.xlsx"
processing files: "Ecobee DYD\Canada 2015\file2.xlsx", "Ecobee DYD\Canada 2016\file2.xlsx", "Ecobee DYD\Canada 2017\file2.xlsx", "Ecobee DYD\Canada 2018\file2.xlsx", "Ecobee DYD\Canada 2019\file2.xlsx"
processing files: "Ecobee DYD\Canada 2015\file3.xlsx", "Ecobee DYD\Canada 2016\file3.xlsx", "Ecobee DYD\Canada 2017\file3.xlsx", "Ecobee DYD\Canada 2018\file3.xlsx", "Ecobee DYD\Canada 2019\file3.xlsx"
done更新
我不能实际测试以下内容,因为,正如我所说的,我没有安装pandas,但是我认为它可以做您想要的事情。我已经使用pandas添加了读取、组合和写入结果的代码,但它们都被注释掉了。
我还将其更改为使用pathlib来处理每个子目录中的文件分组,因为它稍微简单一些。
它创建并放入output_folder的组合文件被命名为filename_combined.xlsx,这是因为您在问题中没有提到它们的名称。
from collections import defaultdict
from glob import glob
from itertools import chain, groupby
##import pandas as pd
##from pandas import ExcelWriter
from pathlib import Path
from pprint import pprint, pformat
from textwrap import indent
root_folder = Path('Ecobee DYD/canada/y') # Shortened for testing.
subfolders = 'Canada 201[5-9]/*.xlsx'
all_excels = sorted(filepath for filepath in root_folder.glob(subfolders))
output_folder = Path("D:/master concordia/DYD/Ecobee DYD/canada/y/new")
# Create dictionary of unique file names and a list of all the paths to each.
groups = defaultdict(list)
for k, g in groupby(all_excels):
groups[Path(k.name)].append(next(g))
# Process each group of files.
for filename, xl_filenames in chain(groups.items()):
print('Combining files:')
print(indent(pformat(xl_filenames), ' '*4))
## dfs = [pd.read_excel(xl_filename) for xl_filename in xl_filenames]
## df = pd.concat(dfs)
outputfile_path = output_folder / (filename.stem + '_combined' + filename.suffix)
print(' Result: "{}"'.format(outputfile_path.as_posix()))
## with ExcelWriter(outputfile_path) as writer:
## df.to_excel(writer)
print('done')示例输出:
Combining files:
[WindowsPath('Ecobee DYD/canada/y/Canada 2015/file1.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2016/file1.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2017/file1.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2018/file1.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2019/file1.xlsx')]
Result: "D:/master concordia/DYD/Ecobee DYD/canada/y/new/file1_combined.xlsx"
Combining files:
[WindowsPath('Ecobee DYD/canada/y/Canada 2015/file2.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2016/file2.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2017/file2.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2018/file2.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2019/file2.xlsx')]
Result: "D:/master concordia/DYD/Ecobee DYD/canada/y/new/file2_combined.xlsx"
Combining files:
[WindowsPath('Ecobee DYD/canada/y/Canada 2015/file3.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2016/file3.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2017/file3.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2018/file3.xlsx'),
WindowsPath('Ecobee DYD/canada/y/Canada 2019/file3.xlsx')]
Result: "D:/master concordia/DYD/Ecobee DYD/canada/y/new/file3_combined.xlsx"
done发布于 2019-11-15 11:52:01
使用python标准库中的pathlib。
from pathlib import Path
import pandas as pd我们可以使用rglob递归地在目录树中搜索与某个模式匹配的所有文件
folder_to_search = 'D:\master concordia\DYD\Ecobee DYD\canada'
pat_ = 'ffb034be30cd9b2d19e8f443b6232db74a270d4c'
all_excels = [f for f in Path(folder_to_search).rglob(f'*{pat_}*.xlsx')]然后,我们需要将它们连接成单个pandas数据帧,记住excel文件的形状必须相同才能得到标准的数据帧。
dfs = [pd.read_excel(f,sheet_name='Sheet1') for f in all_excels]
df = pd.concat(dfs)有关Pathlib的更多信息,请单击此处:https://realpython.com/python-pathlib/
https://stackoverflow.com/questions/58866442
复制相似问题