我创建了简单的代码,其中它接受平面文件(txt),消除噪音和错误,只返回干净的行,即用我感兴趣的数据返回平面文件。它工作良好,但由于我是一个初学者,我相信我的代码是没有效率的。如果你能引导我让它变得更专业、更有效率,我将不胜感激
import pandas as pd
import numpy as np
def openFile(path, type):
file = open(path,type)
return file
def getFileDelim(f_path, f_del):
file = openFile(f_path, 'r')
lines = file.readlines()
cnt_list = np.array([0])
cnt_list = np.delete(cnt_list,0)
for i in range(100):
cnt_list = np.append(cnt_list,lines[i].count(f_del))
return np.bincount(cnt_list).argmax()
def getHeaders(f_path):
file_o = openFile(f_path.replace('.txt', '_ok.txt'), 'r')
file_o_ho = openFile(f_path.replace('.txt', '_ok_header_ok.txt'), 'w')
file_o_hn = openFile(f_path.replace('.txt', '_ok_header_noise.txt'), 'w')
lines = file_o.readlines()
cnt = 0
for line in lines:
if cnt == 0:
header = line
file_o_ho.write(line)
else:
if line != header:
file_o_ho.write(line)
else:
file_o_hn.write(line)
cnt += 1
def fileProcessing(f_path, f_del, del_cnt):
file = openFile(f_path, 'r')
file_o = openFile(f_path.replace('.txt', '_ok.txt'), 'w')
file_n = openFile(f_path.replace('.txt', '_noise.txt'), 'w')
file_e = openFile(f_path.replace('.txt', '_error.txt'), 'w')
lines = file.readlines()
for line in lines:
if line.count("|") == del_cnt:
file_o.write(line)
elif line.count("|") > 0:
file_n.write(line)
elif line.count("|") == 0:
file_e.write(line)
header_process = input('Do you want to analyse the header record of this file (y/n)?:')
if header_process.lower() == 'y' or header_process.lower() == 'yes':
getHeaders(f_path)
def generateReport(f_path, f_del, del_count):
output = "File Processing report:\n"
output += "............................................................................................\n"
output += "File Path: {}\n".format(f_path)
output += "Delimeter: {}\n".format(f_del)
output += "Delimeter Count: {}\n".format(del_count)
output += "\n"
file = openFile(f_path, 'r')
file_o = openFile(f_path.replace('.txt', '_ok.txt'), 'r')
file_n = openFile(f_path.replace('.txt', '_noise.txt'), 'r')
file_e = openFile(f_path.replace('.txt', '_error.txt'), 'r')
f_cnt = len(file.readlines())
f_o_cnt = len(file_o.readlines())
f_e_cnt = len(file_n.readlines())
f_n_cnt = len(file_e.readlines())
output += "Original File Line Count: {}\n".format(f_cnt)
output += "Number of ok lines: {}\n".format(f_o_cnt)
output += "Number of noise lines: {}\n".format(f_n_cnt)
output += "Number of error lines: {}\n".format(f_e_cnt)
output += "Total number of lines assessed: {}\n".format(f_o_cnt + f_n_cnt + f_e_cnt)
file_r = openFile(f_path.replace('.txt', '_report.txt'), 'w')
file_r.write(output)
return output
def main (f_path,f_del,del_cnt = None):
if del_cnt == None:
del_cnt = getFileDelim(f_path, f_del)
fileProcessing(f_path, f_del, del_cnt)
print(generateReport(f_path, f_del, del_cnt))
f_path = input("Please provide file path:").replace('"','')
f_del = input("Provide the delimeter:")
main(f_path,f_del)我将处理的原始文件如下图所示

15.04.2021 Lieferungen mit Auftragsdaten 1
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Lieferungen mit Auftragsdaten
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|Lieferung | Pos|Angel.am |VStl|LFArt|Werk|Warenempf.|IstWA Dat |Material |Materialnummer |Liefermenge|ME |MS|Auftragsart|Aufragsdatum|Auftragsmenge|ME |VB|Werk|Lieferung |Verkaufsb.|
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|9888477351|000010|24.01.2020|LC01|ZLSO |0156|V00000DEB7|24.01.2020|01.93105-9780|RO M3313-C-E235+N-M3574-B21 20X2 HL 6000 | 18 |M |35|ZSO1 |20.01.2020 | 18 |M |01|0156|9888477351|214898993 |
|9888401282|000010|11.01.2020|LC03|ZLSO |0156|V00000CH01|13.01.2020|04.33335-9950|BODENBELAG PVC-MAN323-M-2000-F5251A-GAYA MOSAIC NT4482 BABEL RO24M | 24 |M |35|ZSO1 |20.12.2019 | 24 |M |01|0156|9888401282|214805942 |
|9888437256|000070|17.01.2020|LC01|ZLTA |0156|V00000DE33|20.01.2020|04.38235-9315|SCHWEISSCHNUR PVC FRAN RO100M DUNKELGRAU | 40 |M |35|ZTA1 |16.01.2020 | 40 |M |02|0156|9888437256|214888602 |
|9888363103|000010|06.01.2020|LC01|ZLSO |0156|V00000DE78|06.01.2020|04.38235-9315|SCHWEISSCHNUR PVC FRAN RO100M DUNKELGRAU | 20 |M |35|ZSO1 |06.01.2020 | 20 |M |01|0156|9888363103|214834613 |
|9888411482|000010|14.01.2020|LC03|ZLSO |0156|V008005037|14.01.2020|33.02640-0034|HALTER | 1 |ST |35|ZSO1 |18.12.2019 | 1 |ST |01|0156|9888411482|214795849 |
|9888470166|000010|23.01.2020|LC01|ZLTA |0156|V00000NO00|23.01.2020|33.25140-0010|HALTER RE | 1 |ST |35|ZTA1 |25.11.2019 | 1 |ST |02|0156|9888470166|214696718 |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
15.04.2021 Lieferungen mit Auftragsdaten 2
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|Lieferung | Pos|Angel.am |VStl|LFArt|Werk|Warenempf.|IstWA Dat |Material |Materialnummer |Liefermenge|ME |MS|Auftragsart|Aufragsdatum|Auftragsmenge|ME |VB|Werk|Lieferung |Verkaufsb.|
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|9888502930|000010|29.01.2020|LC01|ZLSO |0156|V000050401|29.01.2020|81.25456-6921|ZSB KABELSTRANG FHS ZUSATZHEIZUNG D5W OHNE TRS | 1 |ST |35|ZSO1 |16.12.2019 | 1 |ST |01|0156|9888502930|214785306 |
|9888503710|000010|29.01.2020|LC01|ZLTA |0156|V00000AT29|29.01.2020|81.25456-6982|ZSB KABELSTRANG RAHMEN LINKS BEI EBS-5 | 1 |ST |35|ZTA1 |11.12.2019 | 1 |ST |02|0156|9888503710|214764484 |
|9888453594|000010|21.01.2020|LC03|ZLSO |0156|V00000GB08|21.01.2020|81.25459-5207|ZSB KABELSTRANG SCHEINWERFER LI RANGIERLEUCHTE | 1 |ST |35|ZSO1 |12.12.2019 | 1 |ST |01|0156|9888453594|214774049 |
|9888477288|000010|24.01.2020|LC03|ZLSO |0156|V00000GB17|24.01.2020|81.25459-7711|ZSB KABELSTRANG FHS WINTERDIENSTBELEUCHTUNG | 1 |ST |35|ZSO1 |05.12.2019 | 1 |ST |01|0156|9888477288|214741051 |
|9888485462|000010|25.01.2020|LC03|ZLTA |0156|V00000GB17|27.01.2020|81.25459-7711|ZSB KABELSTRANG FHS WINTERDIENSTBELEUCHTUNG | 1 |ST |35|ZTA1 |20.12.2019 | 1 |ST |02|0156|9888485462|214805832 |发布于 2021-05-18 23:22:22
哇哦。首先,这与Pandas无关(您包含了库,但随后没有使用它);您对Numpy的使用是可疑的。我建议你跳过Numpy。
一份简短的清单,列出应该改变的事情:
np.empty((0,))for i in range(100)都会崩溃。max即可。.replace('.txt'有一个非常有趣的bug。由于您假设所有文件都以.txt结尾,任何传入的文件如果具有不同的扩展名,都会因为被跳过的替换和随后的覆盖而被践踏。用pathlib和茎代替。getHeaders中一样,这是一种代码气味;在这里是这样的。更容易将文件表示为迭代器,只需执行一个next()即可获得第一行。is None而不是== None以下是完成上述操作的一种方法,并强制执行文件只需要打开一次,只需要遍历一次,并且不需要同时保存在内存中。如果文件足够小(例如1GB或更少),那么使用不同的算法将所有行加载到内存中可能会更快,而这并不是这样做的。总之:
from functools import partial
from itertools import islice, tee
from logging import getLogger, StreamHandler, FileHandler, INFO
from pathlib import Path
from typing import Iterable, TextIO, Callable
def setup_logger():
logger = getLogger('clean')
logger.addHandler(StreamHandler())
logger.addHandler(FileHandler(filename='report.txt'))
logger.setLevel(INFO)
return logger
logger = setup_logger()
def get_delim_counts(file: TextIO, delim: str, n_lines: int = 100) -> Iterable[int]:
for line in islice(file, n_lines):
yield line.count(delim)
file.seek(0)
def match_output(
orig_path: Path,
lines: Iterable[str],
stem: str,
delim: str,
delim_pred: Callable[[int], bool],
) -> Iterable:
n = 0
path = orig_path.with_stem(f'{orig_path.stem}_{stem}')
with path.open('w') as f:
for line in lines:
n_delim = line.count(delim)
matched = delim_pred(n_delim)
if matched:
f.write(line)
n += int(matched)
yield
logger.info(f'Number of {stem} lines: {n}')
def match_header(
orig_path: Path,
lines: Iterable[str],
):
ok_path = orig_path.with_stem(f'{orig_path.stem}_ok_header_ok')
noise_path = orig_path.with_stem(f'{orig_path.stem}_ok_header_noise')
with ok_path.open('w') as ok_file, \
noise_path.open('w') as noise_file:
header = next(lines)
ok_file.write(header)
for line in lines:
if line == header:
noise_file.write(line)
else:
ok_file.write(line)
yield
def process(path: Path, delim: str, process_header: bool) -> None:
logger.info(
'File Processing Report:\n'
f'File path: {path.absolute()}\n'
f'Delimiter: {delim}'
)
with path.open() as orig:
delim_max = max(get_delim_counts(orig, delim))
logger.info(f'Delimiter count: {delim_max}')
match_path = partial(match_output, orig_path=path, delim=delim)
process_funs = [
partial(
match_path, stem='ok', delim_pred=lambda n: n == delim_max,
),
partial(
match_path, stem='noise', delim_pred=lambda n: 0 < n < delim_max,
),
partial(
match_path, stem='error', delim_pred=lambda n: n < 1,
),
]
if process_header:
process_funs.append(
partial(match_header, orig_path=path)
)
iters = [
fun(lines=lines)
for fun, lines in zip(
process_funs, tee(orig, len(process_funs))
)
]
for n_lines, _ in enumerate(zip(*iters)):
pass # could put a progress bar here
logger.info(f'Original file line count: {n_lines}')
def main():
process(
path=Path(input('Provide file path: ')),
delim=input('Provide the delimiter: '),
process_header=input(
'Do you want to analyse the header record of this file (y/n)?: '
).lower().startswith('y'),
)
if __name__ == '__main__':
main()https://codereview.stackexchange.com/questions/260911
复制相似问题