关键词提取。pynlpir库实现关键词提取。
# coding: utf-8
"""Extract weighted keywords from a Chinese sentence with pynlpir."""
import pynlpir

# open() must be paired with close(); the try/finally guarantees the close
# call still runs if keyword extraction raises.
pynlpir.open()
try:
    s = '怎么才能把电脑里的垃圾文件删除'
    # With weighted=True each entry is indexable as (keyword, weight).
    key_words = pynlpir.get_key_words(s, weighted=True)
    for key_word in key_words:
        # Tab-separated "keyword<TAB>weight" output, one keyword per line.
        print(key_word[0], '\t', key_word[1])
finally:
    pynlpir.close()

# Baidu search endpoint used in the next step:
#   https://www.baidu.com/s?wd=机器学习 数据挖掘 信息检索
安装 scrapy:pip install scrapy。创建 scrapy 工程:scrapy startproject baidu_search。编写抓取器:创建 baidu_search/baidu_search/spiders/baidu_search.py 文件,内容如下。
# coding: utf-8
"""Scrapy spider that saves one Baidu search result page to disk."""
import scrapy


class BaiduSearchSpider(scrapy.Spider):
    """Fetch a single Baidu result page and dump its raw body to a file."""

    name = "baidu_search"
    allowed_domains = ["baidu.com"]
    start_urls = [
        "https://www.baidu.com/s?wd=电脑 垃圾 文件 删除"
    ]

    def parse(self, response):
        """Write the response body to ``result.html`` in the working directory.

        Args:
            response: the downloaded page for one of ``start_urls``.
        """
        filename = "result.html"
        # Binary mode: the body is stored exactly as received, undecoded.
        with open(filename, 'wb') as f:
            f.write(response.body)


# After creating the spider, edit settings.py:
#   ROBOTSTXT_OBEY = False
#   USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
#   DOWNLOAD_TIMEOUT = 5
进入 baidu_search/baidu_search/ 目录,运行 scrapy crawl baidu_search。运行结束后会生成 result.html,说明网页已被正确抓取。
语料提取。搜索结果只是索引,真正的内容需要进入链接才能获得。分析抓取结果可知,链接位于 class 含 c-container 的 div 下 h3 内 a 标签的 href 属性中。把这些 url 添加到抓取队列继续抓取;对抓回的页面提取正文、去掉标签,并保存摘要。提取 url 的同时提取标题和摘要,通过 scrapy.Request 的 meta 传递给处理函数 parse_url,这样抓取完成后仍能取到这两个值,再提取 content。最终得到完整数据:url、title、abstract、content。
# coding: utf-8
"""Scrapy spider that follows Baidu search results and extracts page text."""
import scrapy
# NOTE(review): scrapy.utils.markup is reported as deprecated in favour of
# w3lib.html.remove_tags -- confirm against the installed Scrapy version.
from scrapy.utils.markup import remove_tags


class BaiduSearchSpider(scrapy.Spider):
    """Crawl a Baidu result page, then each linked page, printing a summary.

    For every result the spider collects url, title, abstract and the
    tag-stripped page content.
    """

    name = "baidu_search"
    allowed_domains = ["baidu.com"]
    start_urls = [
        "https://www.baidu.com/s?wd=电脑 垃圾 文件 删除"
    ]

    def parse(self, response):
        """Yield one request per search result, carrying title and abstract.

        Args:
            response: the downloaded search result page.

        Yields:
            scrapy.Request for each result link, with ``title`` and
            ``abstract`` stored in ``meta`` for ``parse_url``.
        """
        containers = response.selector.xpath(
            '//div[contains(@class, "c-container")]')
        for container in containers:
            href = container.xpath('h3/a/@href').extract_first()
            title_html = container.xpath('h3/a').extract_first()
            # Some containers carry no h3/a link; skip them instead of
            # letting an IndexError abort the remaining results.
            if not href or title_html is None:
                continue

            abstract_html = container.xpath(
                'div/div/div[contains(@class, "c-abstract")]').extract_first()
            abstract = remove_tags(abstract_html) if abstract_html else ""

            # urljoin leaves absolute links untouched and resolves relative
            # ones, which scrapy.Request would otherwise reject.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_url,
                meta={
                    'title': remove_tags(title_html),
                    'abstract': abstract,
                },
            )

    def parse_url(self, response):
        """Print url, title, abstract and content length for a result page.

        Args:
            response: the downloaded result page; ``response.meta`` holds the
                ``title`` and ``abstract`` set in ``parse``.
        """
        print(len(response.body))
        print("url:", response.url)
        print("title:", response.meta['title'])
        print("abstract:", response.meta['abstract'])
        body_html = response.selector.xpath('//body').extract_first()
        # A page without a <body> element yields empty content, not a crash.
        content = remove_tags(body_html) if body_html else ""
        print("content_len:", len(content))


# References:
《Python 自然语言处理》
http://www.shareditor.com/blogshow/?blogId=43
http://www.shareditor.com/blogshow?blogId=76
欢迎推荐上海机器学习工作机会,我的微信:qingxingfengzi
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。