我正在做一个网页抓取练习,抓取的单词是按出现次数计算的。我想将统计的单词和频率转换为数据帧并保存为excel格式。
我已经尝试了所有的例子,但都不起作用。我想转换这个列表(top),它看起来像这样
Print (top)
[('the', 1)]
[('one', 1)]
[('of', 1)]
[('the', 1)]
[('most', 1)]
...........转换成如下的数据帧:
index Word count
.. the 1
.. one 1
.. of 1
.. the 1
.. most 1
.. ... ..是代码的下半部分
for word in clean_list:
if word in word_count:
word_count[word] += 1
else:
word_count[word] = 1
#To get count of each word in
#the crawled page -->
c = Counter(word_count)
# returns the most occuring elements
top = c.most_common(100)

这是我的代码,它不能工作:
df=pd.DataFrame.from_records(top, columns=["word","count"])
df.to_excel("mine" + ".xls")
print(top)它只保存最后一行,而不是整个列表。如果有人能帮上忙,我会很高兴。谢谢!
完整代码为:`
# Python3 program for a word frequency
# counter after crawling a web-page
import requests
from bs4 import BeautifulSoup
import operator
from collections import Counter
import pandas as pd
from datetime import datetime
import time
import pandas as pd
import numpy as np
from itertools import chain
def start(url):
    """Crawl *url* and forward every lower-cased word found in the page's
    ``<div class="entry-content">`` sections to ``clean_wordlist()``.

    Parameters:
        url: address of the page to fetch.
    """
    # Accumulates every word harvested from the page.
    wordlist = []
    source_code = requests.get(url).text
    # Parse the fetched HTML so the article text can be extracted.
    soup = BeautifulSoup(source_code, 'html.parser')
    # The article body of this site lives under <div class="entry-content">.
    for each_text in soup.findAll('div', {'class': 'entry-content'}):
        content = each_text.text
        # split() breaks the text into whitespace-separated tokens.
        words = content.lower().split()
        wordlist.extend(words)
    # BUG FIX: the transcript shows clean_wordlist() being called inside the
    # loop, which re-processed a partial word list (and re-wrote the output
    # file) once per <div>.  Processing the complete list exactly once is
    # the intended behaviour.
    clean_wordlist(wordlist)
# Function removes any unwanted symbols
def clean_wordlist(wordlist):
    """Strip punctuation/symbol characters from every word in *wordlist*
    and pass the non-empty survivors to ``create_dictionary()``.
    """
    # One translation table, built once, replaces the original loop of
    # chained .replace() calls (same character set; '\\;' also fixes the
    # invalid '\;' escape that warns on modern Python).
    symbols = '!@#$%^&*()_-+={[}]|\\;:"<>?/., '
    table = str.maketrans('', '', symbols)
    clean_list = []
    for word in wordlist:
        word = word.translate(table)
        # Words consisting solely of symbols become empty — drop them.
        if word:
            clean_list.append(word)
    create_dictionary(clean_list)
# Creates a dictionary containing each word's count
# and writes the 100 most frequent words to Excel.
def create_dictionary(clean_list):
    """Count word frequencies in *clean_list* and save the 100 most common
    (word, count) pairs to ``mine.xls``, then print them.
    """
    # Counter performs the tallying the original hand-rolled dict loop did.
    word_count = Counter(clean_list)
    # (word, count) tuples, most frequent first.
    top = word_count.most_common(100)
    # BUG FIX: the original built the frame from `[i[0] for i in top]`,
    # which kept only the words and silently dropped every count.  Passing
    # the (word, count) tuples directly yields one labelled row per word.
    df = pd.DataFrame.from_records(top, columns=['Word', 'Count'])
    # NOTE(review): recent pandas releases dropped the xlwt engine, so the
    # ".xls" extension may fail there — ".xlsx" (openpyxl) is the modern
    # target; kept as-is to preserve the original output filename.
    df.to_excel("mine" + ".xls")
    print(top)
# Driver code
if __name__ == '__main__':
    start("https://www.geeksforgeeks.org/programming-language-choose/")`

发布于 2019-10-09 21:04:14
您的代码中有几个错误。最常见的错误是在循环期间覆盖数据。
就像这里,你一遍又一遍地替换内容!
for each_text in soup.findAll('div', {'class':'entry-content'}):
    content = each_text.text

在这里,您一遍又一遍地重写输出文件。
for each_word in words:
    wordlist.append(each_word)
clean_wordlist(wordlist)

我已经在一个函数中重写了它,如果你愿意,你可以把它拆分成更多的函数。
#Add imports...
def process(url):
    """Crawl *url*, count the word frequencies of its
    ``<div class="entry-content">`` sections and write the 100 most common
    words to ``mine.xls``.

    Parameters:
        url: address of the page to fetch.
    """
    # Fetch the raw HTML of the requested page.
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, 'html.parser')
    # Concatenate the text of every entry-content block — accumulating here
    # (instead of overwriting per iteration) is the point of this rewrite.
    entry_content = ''
    for content in soup.findAll('div', {'class': 'entry-content'}):
        entry_content += content.text.strip()
    # Normalise case, then blank out every unwanted symbol.
    entry_content = entry_content.lower()
    # '\\;' fixes the invalid '\;' escape (SyntaxWarning on modern Python)
    # while keeping the identical character set: backslash and semicolon.
    symbols = list('!@#$%^&*()_-+={[}]|\\;:"<>?/., ')
    symbols.append('\n')  # new line char
    for symbol in symbols:
        entry_content = entry_content.replace(symbol, ' ')
    # split() collapses runs of whitespace and yields the individual words.
    words = entry_content.split()
    # Count occurrences of each word.
    c = Counter(words)
    # split() can never produce ' ', but keep the original defensive check.
    if ' ' in c:
        del c[' ']
    # Persist the 100 most frequent (word, count) pairs, one row per word.
    most_common_words = c.most_common(100)
    df = pd.DataFrame.from_records(most_common_words, columns=['Word', 'Count'])
    df.to_excel("mine.xls")
# Driver code
if __name__ == '__main__':
    process("https://www.geeksforgeeks.org/programming-language-choose/")

https://stackoverflow.com/questions/58287439
复制相似问题