我正在编写一个下载程序,它会将url拆分为各个部分并使用线程下载,可能我不会使用“联接”,因为join =无法流(如果所有线程都未完成,则无法写入文件)。
但是问题是f.seek和写输出的文件真的很奇怪,文件的内容总是有"NUL“字符(在Notepad++中),而文件中的文本只占整个文件的1/3。
大家,谢谢大家的帮助,这是我的2.0版代码,感谢Padraic Cunningham的建议和感叹,我修改我的代码就像你建议的那样:所以请帮我检查代码,我想需要你帮我把它转换成http.server文件流处理方法:
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re
pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
countsize = 0
#if os.path.exists(fileName):
# os.remove(fileName)
def defwrite(filename,data,offset):
f = open(filename,'wb')
f.seek(offset)
f.write(data)
f.close()
def buildRange(url, numsplits):
global pool
value = int(requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None))
print("Fullsize: ", value)
print("Try devide with 3 :", value / 3)
lst = []
for i in range(numsplits):
if i == range(numsplits):
lst.append('%s-%s' % (i * value//numsplits + 1, i * value//numsplits + 1 + (value - (i * value//numsplits + 1))))
if i == 0:
lst.append('%s-%s' % (0, value//numsplits))
else:
lst.append('%s-%s' % (i * value//numsplits + 1, (i + 1) * value//numsplits))
return lst
def main(url=None, splitBy=3):
global fileName, pool, countsize
start_time = time.time()
if not url:
print("Please Enter some url to begin download.")
return
#fileName = "1.jpg"
#print("%s bytes to download." % sizeInBytes)
# if not sizeInBytes:
# print("Size cannot be determined.")
# return
#sinzeInBytes = buildRange(url,
dataDict = {}
f = open(fileName,'wb')
# split total num bytes into ranges
#ranges = buildRange(url,int(sizeInBytes), splitBy)
ranges = buildRange(url, splitBy)
print(ranges)
def downloadChunk(idx, irange):
print(idx)
#time.sleep(1*idx)
#req = urllib.request.Request(url)
#req.headers['Range'] = 'bytes={}'.format(irange)
headers = urllib3._collections.HTTPHeaderDict()
headers.add('Range', 'bytes=' + str(irange))
data = pool.urlopen('GET', URL, headers=headers).data
#print(data)
#print("finish: " + str(irange))
offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
print(offset)
# print(irange)
f.seek(offset, 0)
#f.truncate(0)
#print(f.tell())
f.write(data)
#f.read()
#f.close()
countsize = countsize + offset
#defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))
# create one downloading thread per chunk
downloaders = [
threading.Thread(
target=downloadChunk,
args=(idx, irange),
)
for idx,irange in enumerate(ranges)
]
# start threads, let run in parallel, wait for all to finish
for th in downloaders:
th.start()
#th.isAlive()
#for th in downloaders:
#th.join()
#print(th.join)
print(countsize)
#print('done: got {} chunks, total {} bytes'.format(
# len(dataDict), sum( (
## len(chunk) for chunk in list(dataDict.values())
# ) )
#))
#print("--- %s seconds ---" % str(time.time() - start_time))
# if os.path.exists(fileName):
# os.remove(fileName)
#reassemble file in correct order
#with open(fileName, 'wb') as fh:
# for _idx,chunk in sorted(dataDict.items()):
# fh.write(chunk)
#stream_chunk = 16 * 1024
#with open(fileName, 'wb') as fp:
# while True:
# for _idx,chunk in sorted(dataDict.items()):
#fh.write(chunk)
# chunking = chunk.read(stream_chunk)
# if not chunk:
# break
# fp.write(chunking)
# print("Finished Writing file %s" % fileName)
#print('file size {} bytes'.format(os.path.getsize(fileName)))
if __name__ == '__main__':
if os.path.exists(fileName):
os.remove(fileName)
main(URL, splitBy=16)下面是我的代码,请帮助我修复它: 1.0版本,忽略它,2.0版本:
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re
pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
#if os.path.exists(fileName):
# os.remove(fileName)
def defwrite(filename,data,offset):
f = open(filename,'wb')
f.seek(offset)
f.write(data)
f.close()
def buildRange(value, numsplits):
lst = []
for i in range(numsplits):
if i == range(numsplits):
lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(value - round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
if i == 0:
lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
else:
lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
return lst
def main(url=None, splitBy=3):
global fileName, pool
start_time = time.time()
if not url:
print("Please Enter some url to begin download.")
return
#fileName = "1.jpg"
sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
print("%s bytes to download." % sizeInBytes)
if not sizeInBytes:
print("Size cannot be determined.")
return
dataDict = {}
# split total num bytes into ranges
ranges = buildRange(int(sizeInBytes), splitBy)
def downloadChunk(idx, irange):
print(idx)
#req = urllib.request.Request(url)
#req.headers['Range'] = 'bytes={}'.format(irange)
headers = urllib3._collections.HTTPHeaderDict()
headers.add('Range', 'bytes=' + str(irange))
data = pool.urlopen('GET', URL, headers=headers).data
print(data)
print("finish: " + str(irange))
offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
#print(offset)
# print(irange)
f = open(fileName,'wb')
f.seek(offset)
#f.truncate(0)
#print(f.tell())
f.write(data)
#f.read()
#f.close()
#defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))
# create one downloading thread per chunk
downloaders = [
threading.Thread(
target=downloadChunk,
args=(idx, irange),
)
for idx,irange in enumerate(ranges)
]
# start threads, let run in parallel, wait for all to finish
for th in downloaders:
th.start()
#th.isAlive()
#for th in downloaders:
#th.join()
#print(th.join)
#print('done: got {} chunks, total {} bytes'.format(
# len(dataDict), sum( (
## len(chunk) for chunk in list(dataDict.values())
# ) )
#))
#print("--- %s seconds ---" % str(time.time() - start_time))
# if os.path.exists(fileName):
# os.remove(fileName)
#reassemble file in correct order
#with open(fileName, 'wb') as fh:
# for _idx,chunk in sorted(dataDict.items()):
# fh.write(chunk)
#stream_chunk = 16 * 1024
#with open(fileName, 'wb') as fp:
# while True:
# for _idx,chunk in sorted(dataDict.items()):
#fh.write(chunk)
# chunking = chunk.read(stream_chunk)
# if not chunk:
# break
# fp.write(chunking)
# print("Finished Writing file %s" % fileName)
#print('file size {} bytes'.format(os.path.getsize(fileName)))
if __name__ == '__main__':
main(URL, splitBy=3)发布于 2015-05-31 10:23:50
在目标函数为downloadChunk的情况下,使用三个线程,使用wb打开文件三次,覆盖,从而获得1/3的内容。你也会无缘无故地呼吁寻求。如果您想附加到文件中,每次都要使用a打开,或者只在函数之外打开一次。您正在尝试使用空文件进行查找并写入,因此空字节就是从那里来的。
如果您想打开一个用于读写的文件,那么您可以使用行缓冲来查找:
with open("whatever.file", "r+b",buffering=1) as f然后使用该文件进行写入,不要一直在函数中打开和覆盖,该文件也必须存在。
https://stackoverflow.com/questions/30556258
复制相似问题