首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >Scrapy spider以JSON格式导出CSV文件

Scrapy spider以JSON格式导出CSV文件
EN

Stack Overflow用户
提问于 2020-07-26 23:57:30
回答 1查看 278关注 0票数 0

我用Scrapy Spider创建了一个GUI应用程序,但当我将数据保存为CSV时,它不能以正确的格式导出。它以JSON格式导出数据。这个问题的原因和解决方法是什么?您可以在下面的屏幕截图中看到输出。

完整的项目在这里:https://drive.google.com/file/d/1Ztgqi6-dLH6YHJBo-e9R5rwvWdCGOJhD/view?usp=sharing

GUI应用程序代码如下。它是动态的,所以它可以与任何scrapy项目一起工作:

请从DropDown中选择CSV:

代码语言:javascript
复制
from tkinter import *
from tkinter import messagebox
from tkinter import filedialog
from scrapy.utils import project
from scrapy import spiderloader
from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
import threading


def get_spiders():
    settings = project.get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    return spider_loader.list()

def get_chosen_spider(value):
    global chosen_spider
    chosen_spider = value
    return chosen_spider

def get_chosen_feed(value):
    global chosen_feed
    chosen_feed = value
    return chosen_feed


def browse_button():
    global folder_path
    folder_path = filedialog.askdirectory()
    folder_path_entry.delete(0, END)
    folder_path_entry.insert(0, folder_path)
    return folder_path

def execute_spider():
    if dataset_entry.get() == '' or chosen_feed not in ['CSV', 'JSON']:
        messagebox.showerror('Error', 'All entries are required')
        return
    
    try:
        feed_uri = f"file:///{folder_path}/{dataset_entry.get()}.{chosen_feed}"
    except:
        messagebox.showerror('Error', 'All entries all required')
    
    settings = project.get_project_settings()
    settings.set('FEED_URI', feed_uri)
    settings.set('FEED_TYPE', chosen_feed)

    configure_logging()
    runner = CrawlerRunner(settings)
    runner.crawl(chosen_spider)
    
    reactor.run(installSignalHandlers=False)

def start_execute_thread(event):
    global execute_thread
    execute_thread = threading.Thread(target=execute_spider, daemon=True)
    execute_thread.start()
    app.after(10, check_execute_thread)

def check_execute_thread():
    if execute_thread.is_alive():
        app.after(10, check_execute_thread)



app = Tk()

#Spiders list
spider_label = Label(app, text='Choose a spider')
spider_label.grid(row=0 , column=0, sticky=W, pady=10, padx=10)

spider_text = StringVar(app)
spider_text.set('Choose a spider')
spiders = [spider for spider in get_spiders()]

spiders_dropdown = OptionMenu(app, spider_text, *spiders, command=get_chosen_spider)
spiders_dropdown.grid(row=0, column=1, columnspan=2)

# Feed Type
feed_label = Label(app, text='Choose a feed')
feed_label.grid(row=1 , column=0, sticky=W, pady=10, padx=10)

feed_text = StringVar(app)
feed_text.set('Choose a feed')
feeds = ['JSON', 'CSV']

feed_dropdown = OptionMenu(app, feed_text, *feeds, command=get_chosen_feed)
feed_dropdown.grid(row=1, column=1, columnspan=2)

# Path Entry
folder_path_text = StringVar(app)
folder_path_entry = Entry(app, textvariable=folder_path_text)
folder_path_entry.grid(row=2, column=0, pady=10, padx=10)

# Dataset Entry
dataset_text = StringVar(app)
dataset_entry = Entry(app, textvariable=dataset_text, width=10)
dataset_entry.grid(row=2, column=1, pady=10, padx=10)

browse_btn = Button(app, text='Browse', command=browse_button)
browse_btn.grid(row=2, column=2)

#update this one too
execute_btn = Button(app, text='Execute', command=lambda: start_execute_thread(None))
execute_btn.grid(row=3, column=0, columnspan=3)

app.title('Spider Executer')
app.geometry('300x200')
app.resizable(False, False)
app.mainloop()
EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2020-07-27 12:09:31

请更改此行:

代码语言:javascript
复制
settings.set('FEED_TYPE', chosen_feed)

至:

代码语言:javascript
复制
settings.set('FEED_FORMAT', chosen_feed)
票数 1
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/63102476

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档