我正在努力加快这个节目的执行速度。唯一可以更改的文件是dictionary.c。
问题基本上是加载字典,然后对提交的文本进行拼写检查。我们使用的方法是假设字典已经被排序(确实如此),然后在trie中创建一个哈希表,然后多次尝试加载该字典。所有这些都是为了使检查函数更快。不巧的是,虽然程序运行,但它的性能没有那么好。您可以在这里看到它相对于其他提交文件的表现:https://speller.cs50.net/cs50/problems/2020/x/challenges/speller,我被列在3004号
speller.c
// Implements a spell-checker
#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include "dictionary.h"
// Undefine any definitions
#undef calculate
#undef getrusage
// Default dictionary
#define DICTIONARY "dictionaries/large"
// Prototype
double calculate(const struct rusage *b, const struct rusage *a);
int main(int argc, char *argv[])
{
// Check for correct number of args
if (argc != 2 && argc != 3)
{
printf("Usage: ./speller [DICTIONARY] text\n");
return 1;
}
// Structures for timing data
struct rusage before, after;
// Benchmarks
double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;
// Determine dictionary to use
char *dictionary = (argc == 3) ? argv[1] : DICTIONARY;
// Load dictionary
getrusage(RUSAGE_SELF, &before);
bool loaded = load(dictionary);
getrusage(RUSAGE_SELF, &after);
// Exit if dictionary not loaded
if (!loaded)
{
printf("Could not load %s.\n", dictionary);
return 1;
}
// Calculate time to load dictionary
time_load = calculate(&before, &after);
// Try to open text
char *text = (argc == 3) ? argv[2] : argv[1];
FILE *file = fopen(text, "r");
if (file == NULL)
{
printf("Could not open %s.\n", text);
unload();
return 1;
}
// Prepare to report misspellings
printf("\nMISSPELLED WORDS\n\n");
// Prepare to spell-check
int index = 0, misspellings = 0, words = 0;
char word[LENGTH + 1];
// Spell-check each word in text
for (int c = fgetc(file); c != EOF; c = fgetc(file))
{
// Allow only alphabetical characters and apostrophes
if (isalpha(c) || (c == '\'' && index > 0))
{
// Append character to word
word[index] = c;
index++;
// Ignore alphabetical strings too long to be words
if (index > LENGTH)
{
// Consume remainder of alphabetical string
while ((c = fgetc(file)) != EOF && isalpha(c));
// Prepare for new word
index = 0;
}
}
// Ignore words with numbers (like MS Word can)
else if (isdigit(c))
{
// Consume remainder of alphanumeric string
while ((c = fgetc(file)) != EOF && isalnum(c));
// Prepare for new word
index = 0;
}
// We must have found a whole word
else if (index > 0)
{
// Terminate current word
word[index] = '\0';
// Update counter
words++;
// Check word's spelling
getrusage(RUSAGE_SELF, &before);
bool misspelled = !check(word);
getrusage(RUSAGE_SELF, &after);
// Update benchmark
time_check += calculate(&before, &after);
// Print word if misspelled
if (misspelled)
{
printf("%s\n", word);
misspellings++;
}
// Prepare for next word
index = 0;
}
}
// Check whether there was an error
if (ferror(file))
{
fclose(file);
printf("Error reading %s.\n", text);
unload();
return 1;
}
// Close text
fclose(file);
// Determine dictionary's size
getrusage(RUSAGE_SELF, &before);
unsigned int n = size();
getrusage(RUSAGE_SELF, &after);
// Calculate time to determine dictionary's size
time_size = calculate(&before, &after);
// Unload dictionary
getrusage(RUSAGE_SELF, &before);
bool unloaded = unload();
getrusage(RUSAGE_SELF, &after);
// Abort if dictionary not unloaded
if (!unloaded)
{
printf("Could not unload %s.\n", dictionary);
return 1;
}
// Calculate time to unload dictionary
time_unload = calculate(&before, &after);
// Report benchmarks
printf("\nWORDS MISSPELLED: %d\n", misspellings);
printf("WORDS IN DICTIONARY: %d\n", n);
printf("WORDS IN TEXT: %d\n", words);
printf("TIME IN load: %.2f\n", time_load);
printf("TIME IN check: %.2f\n", time_check);
printf("TIME IN size: %.2f\n", time_size);
printf("TIME IN unload: %.2f\n", time_unload);
printf("TIME IN TOTAL: %.2f\n\n",
time_load + time_check + time_size + time_unload);
// Success
return 0;
}
// Returns number of seconds between b and a
double calculate(const struct rusage *b, const struct rusage *a)
{
if (b == NULL || a == NULL)
{
return 0.0;
}
else
{
return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
(b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
(b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
/ 1000000.0);
}
}dictionary.h
// Declares a dictionary's functionality
#ifndef DICTIONARY_H
#define DICTIONARY_H
#include <stdbool.h>
// Maximum length for a word
// (e.g., pneumonoultramicroscopicsilicovolcanoconiosis)
#define LENGTH 45
// Prototypes
bool check(const char *word);
unsigned int hash(const char *word);
bool load(const char *dictionary);
unsigned int size(void);
bool unload(void);
#endif // DICTIONARY_Hdictionary.c
//实现字典的功能
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include "dictionary.h"
#include <limits.h>
#include <math.h>
#include <ctype.h>
#define MAXWORDS 200000
#define MAXBUCKETS 511
unsigned int numberOfWords = 0;
int target;
unsigned int N = 1; //number of buckets in hash table. (a function of how big the dictionary is)
// we modified nodeWords to containe more information
typedef struct nodeWords
{
char *word;
char *wordEnd;
int index;
struct nodeWords *left;
struct nodeWords *right;
} nodeWords;
nodeWords *headHashNode;
nodeWords **nodeArray;
nodeWords *nodeHash;
nodeWords *nodeTri;
nodeWords **nodeHistory;
int hashBuckets;
int hashFlag = 0;
char (*words)[LENGTH + 1] = NULL;
long int findSize(const char *file_name);
unsigned int size(void);
unsigned int hash(const char *word);
bool check(const char *word);
bool unload(void);
nodeWords *leftRightAddress(int buckets, nodeWords *nodeGen);
int *nextBuckets(int power, int kick, int oob, int buckets);
//used for testing
typedef struct nodeNumbers
{
int number;
struct nodeNumbers *left;
struct nodeNumbers *right;
} nodeNumbers;
// Returns true if word is in dictionary else false
bool check(const char *word)
{
// we created tries for both the hash table and dictionary
int lc = strlen(word);;
char *lcWord = malloc(46);
for (int i = 0; i < lc; i++)
{
lcWord[i] = tolower(word[i]);
}
lcWord[lc] = '\0';
int nodeIdx = hash(lcWord);
if (nodeIdx < 0)
{
free(lcWord);
return false;
}
nodeWords *searchNode;
searchNode = nodeArray[nodeIdx];
bool searchContinues = true, found = false;
// this is our binary searech
do
{
if (strcmp(lcWord, searchNode->word) == 0)
{
searchContinues = false;
found = true;
}
else if (strcmp(lcWord, searchNode->word) < 0)
{
if (searchNode->left == NULL)
{
searchContinues = false;
}
else
{
searchNode = searchNode->left;
}
}
else
{
if (searchNode->right == NULL)
{
searchContinues = false;
}
else
{
searchNode = searchNode->right;
}
}
}
while (searchContinues);
free(lcWord);
return found;
}
unsigned int hash(const char *word)
{
// we assume the dictionary is already sorted
nodeWords *node;
node = headHashNode;
int idx = -9;
int lc = strlen(word);;
char *lcWord = malloc(46);
bool rightFlag = false;
bool leftFag = false;
for (int i = 0; i < lc; i++)
{
lcWord[i] = tolower(word[i]);
}
lcWord[lc] = '\0';
// printf("HASH\n");
// printf("%s %s %s\n",lcWord, words[0],words[numberOfWords - 1]);
if (strcmp(lcWord, words[0]) < 0 || strcmp(lcWord, words[numberOfWords - 1]) > 0)
{
free(lcWord);
return -1;
}
do
{
// printf("%s %s %s %d\n",lcWord, node->word,node->wordEnd,strcmp(lcWord,node->word));
if (strcmp(lcWord, node->word) == 0)
{
//printf("FOUND 0\n");
free(lcWord);
return node->index;
}
else if (strcmp(lcWord, node->word) > 0)
{
if (strcmp(lcWord, node->wordEnd) <= 0 || node->right == NULL)
{
//printf("FOUND 1\n");
free(lcWord);
return node->index;
}
// printf("RIGHT\n");
node = node->right;
}
else
{
// printf("LEFT\n");
if (node->left == NULL)
{
return -1;
}
else
{
node = node->left;
}
}
}
while (true);
}
// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
// messy but does the job
// printf("enter load\n");
//find number of bytes in dictionary file
long int dictionarySize = findSize(dictionary);
if (dictionarySize == -1)
{
//printf("File Not Found 0!\n");
return false;
}
//printf("dictsize %ld\n",dictionarySize);
//allocate memory to read in entire file
char *data = malloc(dictionarySize + 1);
//open dictionary file
int filedesc = open(dictionary, O_RDONLY);
if (filedesc < 0)
{
//printf("File Not Found 1!\n");
return false;
}
//trying to gain load speed
//read entire dictionary
int bytesRead = read(filedesc, data, dictionarySize);
close(filedesc);
//printf("bytes %d\n",bytesRead);
FILE *fp;
fp = fopen("dict.txt", "w+");
int row = 0;
int col = 0;
words = malloc(MAXWORDS * (LENGTH + 1));
//parse dictionary string based on new lines and replace new line char with \0 (null)
for (int i = 0; i < dictionarySize ; i++)
{
words[row][col] = data[i];
col++;
if (data[i] == 0x0a)
{
words[row][col - 1] = '\0';
//printf("%d %s\n", row , words[row]);
col = 0;
row++;
}
}
//printf("col = %d %s\n",col,words[row-1] );
if (col != 0)
{
words[row][col] = '\0';
//printf("extra %d %s\n", row , words[row]);
}
else
{
row--;
}
numberOfWords = row + 1;
fclose(fp);
free(data);
//compute number of hash buckets
target = (int) pow(2, ceil(log2(pow(numberOfWords, .5)))) - 1; //target = number of buckets per tri (except maybe the last one.)
if (target == 0)
{
target = 1;
}
hashBuckets = ceil((float)numberOfWords /
target); // hashBuckets = number of buckets for hash table - each bucket will point to a tri
//printf("words %d target %d hashbuckets %d\n",numberOfWords,target,hashBuckets);
if (hashBuckets > MAXBUCKETS)
{
printf("FATAL ERROR: Too many hash buckets\n");
exit(-1);
}
//allocate space for hash buckets
int cnt = 0;
nodeHash = malloc(hashBuckets * sizeof(*nodeHash));
for (int i = 0; i < numberOfWords ; i = i + target)
{
int endIdx = (i + target - 1 < numberOfWords) ? i + target - 1 : numberOfWords - 1;
nodeHash[cnt].word = malloc((strlen(words[i]) + 1) * sizeof(char));
strcpy(nodeHash[cnt].word, words[i]);
nodeHash[cnt].wordEnd = malloc((strlen(words[endIdx]) + 1) * sizeof(char));
strcpy(nodeHash[cnt].wordEnd, words[endIdx]);
cnt++;
}
headHashNode = leftRightAddress(hashBuckets, nodeHash);
int triCnt;
cnt = 0;
nodeArray = malloc(hashBuckets * sizeof(*nodeHash));
nodeHistory = malloc(hashBuckets * sizeof(*nodeHash));
for (int i = 0; i < numberOfWords; i = i + target)
{
nodeTri = malloc(target * sizeof(*nodeTri));
nodeHistory[cnt] = nodeTri;
triCnt = 0;
for (int j = i, stopper = (i + target < numberOfWords) ? i + target : numberOfWords; j < stopper; j++)
{
nodeTri[triCnt].word = malloc((strlen(words[j]) + 1) * sizeof(char));
strcpy(nodeTri[triCnt].word, words[j]);
int endIdx = (j + target - 1 < numberOfWords) ? j + target - 1 : numberOfWords - 1;
nodeTri[triCnt].wordEnd = malloc((strlen(words[endIdx]) + 1) * sizeof(char));
strcpy(nodeTri[triCnt].wordEnd, words[endIdx]);
triCnt++;
}
nodeArray[cnt] = leftRightAddress(triCnt, nodeTri);
cnt++;
}
return true;
}
// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int size(void)
{
return numberOfWords;
}
// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
// quick and easy
for (int i = 0; i < hashBuckets; i++)
{
free(nodeHash[i].word);
free(nodeHash[i].wordEnd);
}
free(nodeHash);// TODO
int cnt = 0;
for (int i = 0; i < hashBuckets; i++)
{
nodeTri = nodeHistory[i];
for (int j = 0; j < target; j++)
{
if (cnt < numberOfWords)
{
free(nodeTri[j].word);
free(nodeTri[j].wordEnd);
}
cnt++;
}
free(nodeTri);
}
free(words);
free(nodeArray);
free(nodeHistory);
return true;
}
long int findSize(const char *file_name)
{
// opening the file in read mode
FILE *fp = fopen(file_name, "r");
// checking if the file exist or not
if (fp == NULL)
{
//printf("File Not Found!\n");
return -1;
}
fseek(fp, 0L, SEEK_END);
// calculating the size of the file
long int size = ftell(fp);
// closing the file
fclose(fp);
return size;
}
nodeWords *leftRightAddress(int buckets, nodeWords *nodeGen)
{
//did not use recursion for speed
bool printFlag = false;
int numbers[buckets] ;
for (int i = 0; i < buckets; i++)
{
numbers[i] = i;
}
int loops = (int)(log2(buckets) + .000001),
cnt = 0,
oob = 0,
rob = 0;
int idx, ilx = -100, irx = -100,
jdx, jlx = -100, jrx = -100,
kdx, klx = -100, krx = -100,
ldx, llx = -100, lrx = -100,
mdx, mlx = -100, mrx = -100,
ndx, nlx = -100, nrx = -100,
odx, olx = -100, orx = -100,
pdx, plx = -100, prx = -100,
qdx, qlx = -100, qrx = -100;
int ikick = 0, ijmp = 0,
jkick = 0, jjmp = 0,
kkick = 0, kjmp = 0,
lkick = 0, ljmp = 0,
mkick = 0, mjmp = 0,
nkick = 0, njmp = 0,
okick = 0, ojmp = 0,
pkick = 0, pjmp = 0,
qkick = 0, qjmp = 0;
int *nextBucket;
bool oddlot = false;
int used[buckets];
if (buckets != (int)pow(2, loops + 1) - 1)
{
for (int i = 0; i < buckets; i++)
{
used[i] = -99;
}
oddlot = true;
}
if (printFlag)
{
printf("loops= %d buckets=%d \n", loops, buckets);
}
for (int i = 0; i < buckets; i++)
{
if (printFlag)
{
printf("%d %s %p \n", i, nodeGen[i].word, &nodeGen[i]);
}
}
//comments
for (int i = 0; i < 1; i++)
{
if (cnt < buckets)
{
idx = pow(2, loops) - 1;
nodeGen[idx].index = idx;
if (printFlag)
{
printf("index = %d idx = %d word = %s cnt = %d \n", nodeGen[idx].index, idx, nodeGen[idx].word, cnt);
}
cnt ++;
if (loops > 0)
{
nextBucket = nextBuckets(loops - 1, jkick, oob, buckets);
if (printFlag)
{
printf("NEXT %d %d %s\n", nextBucket[0], nextBucket[1], nodeGen[idx].word);
}
nodeGen[idx].left = &nodeGen[nextBucket[0]];
nodeGen[idx].right = &nodeGen[nextBucket[1]];
//printf("VALUES %f %d idx=%d i=%d\n",pow(2,loops)-1,oddlot,idx,i);
if (idx > pow(2, loops) - 1 && oddlot)
{
used[idx] = idx;
if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
{
nodeGen[idx].left = NULL;
}
}
if (nextBucket[1] <= idx)
{
nodeGen[idx].right = NULL;
}
}
else
{
nodeGen[idx].left = NULL;
nodeGen[idx].right = NULL;
}
if (printFlag)
{
printf("node idx = %p %d left: %p %d right: %p %d oob: %d rob: %d\n\n", &nodeGen[idx], idx, nodeGen[idx].left, ilx,
nodeGen[idx].right, irx, oob, rob);
}
}
//comments
for (int j = 0; j < ((buckets > 1) ? 2 : 0); j++)
{
if (cnt < buckets)
{
jdx = pow(2, loops - 1) - 1 + jkick;
jkick = jkick + pow(2, loops);
if (jdx > buckets - 1 - oob)
{
while (jdx > buckets - 1 - oob)
{
jdx = jdx - 1 ;
}
oob++;
}
nodeGen[jdx].index = jdx;
if (printFlag)
{
printf("index = %d jdx = %d word = %s cnt = %d \n", nodeGen[jdx].index, jdx, nodeGen[jdx].word, cnt) ;
}
cnt ++;
if (loops > 1)
{
nextBucket = nextBuckets(loops - 2, kkick, oob, buckets);
if (printFlag)
{
printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
}
nodeGen[jdx].left = &nodeGen[nextBucket[0]];
nodeGen[jdx].right = &nodeGen[nextBucket[1]];
if (jdx > pow(2, loops) - 1 && oddlot)
{
used[jdx] = jdx;
if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
{
nodeGen[jdx].left = NULL;
}
if (nextBucket[1] <= jdx)
{
nodeGen[jdx].right = NULL;
}
}
}
else
{
nodeGen[jdx].left = NULL;
nodeGen[jdx].right = NULL;
}
if (printFlag)
{
printf("node = %p %d left: %p %d right: %p %d\n\n", &nodeGen[jdx], jdx, nodeGen[jdx].left, jlx, nodeGen[jdx].right, jrx);
}
}
//comments
for (int k = 0; k < ((buckets > 3) ? 2 : 0); k++)
{
if (cnt < buckets)
{
kdx = pow(2, loops - 2) - 1 + kkick;
kkick = kkick + pow(2, loops - 1);
if (kdx > buckets - 1 - oob)
{
while (kdx > buckets - 1 - oob)
{
kdx = kdx - 1 ;
}
oob++;
}
nodeGen[kdx].index = kdx;
if (printFlag)
{
printf("index = %d kdx = %d word = %s cnt = %d \n", nodeGen[kdx].index, kdx, nodeGen[kdx].word, cnt) ;
}
cnt ++;
if (loops > 2)
{
nextBucket = nextBuckets(loops - 3, lkick, oob, buckets);
if (printFlag)
{
printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
}
nodeGen[kdx].left = &nodeGen[nextBucket[0]];
nodeGen[kdx].right = &nodeGen[nextBucket[1]];
if (kdx > pow(2, loops) - 1 && oddlot)
{
used[kdx] = kdx;
if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
{
nodeGen[kdx].left = NULL;
}
if (nextBucket[1] <= kdx)
{
nodeGen[kdx].right = NULL;
}
}
}
else
{
nodeGen[kdx].left = NULL;
nodeGen[kdx].right = NULL;
}
if (printFlag)
{
printf("node = %p %d left: %p %d right: %p %d\n\n", &nodeGen[kdx], kdx, nodeGen[kdx].left, klx, nodeGen[kdx].right, krx);
}
}
//comments
for (int l = 0; l < ((buckets > 7) ? 2 : 0); l++)
{
if (cnt < buckets)
{
ldx = pow(2, loops - 3) - 1 + lkick;
lkick = lkick + pow(2, loops - 2);
if (ldx > buckets - 1 - oob)
{
while (ldx > buckets - 1 - oob)
{
ldx = ldx - 1;
}
oob++;
}
nodeGen[ldx].index = ldx;
if (printFlag)
{
printf("index = %d ldx = %d word = %s cnt = %d \n", nodeGen[ldx].index, ldx, nodeGen[ldx].word, cnt) ;
}
cnt ++;
if (loops > 3)
{
nextBucket = nextBuckets(loops - 4, mkick, oob, buckets);
if (printFlag)
{
printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
}
nodeGen[ldx].left = &nodeGen[nextBucket[0]];
nodeGen[ldx].right = &nodeGen[nextBucket[1]];
if (ldx > pow(2, loops) - 1 && oddlot)
{
used[ldx] = ldx;
if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
{
nodeGen[ldx].left = NULL;
}
if (nextBucket[1] <= ldx)
{
nodeGen[ldx].right = NULL;
}
}
}
else
{
nodeGen[ldx].left = NULL;
nodeGen[ldx].right = NULL;
}
if (printFlag)
{
printf("node = %p %d left: %p %d right: %p %d\n\n", &nodeGen[ldx], ldx, nodeGen[ldx].left, llx, nodeGen[ldx].right, lrx);
}
}
//comments
for (int m = 0; m < ((buckets > 15) ? 2 : 0); m++)
{
if (cnt < buckets)
{
mdx = pow(2, loops - 4) - 1 + mkick;
mkick = mkick + pow(2, loops - 3);
if (mdx > buckets - 1 - oob)
{
while (mdx > buckets - 1 - oob)
{
mdx = mdx - 1;
}
oob++;
}
nodeGen[mdx].index = mdx;
if (printFlag)
{
printf("index = %d mdx = %d word = %s cnt = %d \n", nodeGen[mdx].index, mdx, nodeGen[mdx].word, cnt) ;
}
cnt ++;
if (loops > 4)
{
nextBucket = nextBuckets(loops - 5, nkick, oob, buckets);
if (printFlag)
{
printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
}
nodeGen[mdx].left = &nodeGen[nextBucket[0]];
nodeGen[mdx].right = &nodeGen[nextBucket[1]];
if (mdx > pow(2, loops) - 1 && oddlot)
{
used[mdx] = mdx;
if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
{
nodeGen[mdx].left = NULL;
}
if (nextBucket[1] <= mdx)
{
nodeGen[mdx].right = NULL;
}
}
}
else
{
nodeGen[mdx].left = NULL;
nodeGen[mdx].right = NULL;
}
if (printFlag)
{
printf("node = %p %d left: %p %d right: %p %d oob: %d rob: %d\n\n", &nodeGen[mdx], mdx, nodeGen[mdx].left, mlx,
nodeGen[mdx].right, mrx, oob, rob);
}
}
//comments
for (int n = 0; n < ((buckets > 31) ? 2 : 0); n++)
{
if (cnt < buckets)
{
ndx = pow(2, loops - 5) - 1 + nkick;
nkick = nkick + pow(2, loops - 4);
if (ndx > buckets - 1 - oob)
{
while (ndx > buckets - 1 - oob)
{
ndx = ndx - 1;
}
oob++;
}
nodeGen[ndx].index = ndx;
if (printFlag)
{
printf("index = %d ndx = %d word = %s cnt = %d \n", nodeGen[ndx].index, ndx, nodeGen[ndx].word, cnt) ;
}
cnt ++;
if (loops > 5)
{
nextBucket = nextBuckets(loops - 6, okick, oob, buckets);
if (printFlag)
{
printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
}
nodeGen[ndx].left = &nodeGen[nextBucket[0]];
nodeGen[ndx].right = &nodeGen[nextBucket[1]];
if (ndx > pow(2, loops) - 1 && oddlot)
{
used[ndx] = ndx;
if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
{
nodeGen[ndx].left = NULL;
}
if (nextBucket[1] <= ndx)
{
nodeGen[ndx].right = NULL;
}
}
}
else
{
nodeGen[ndx].left = NULL;
nodeGen[ndx].right = NULL;
}
if (printFlag)
{
printf("node = %p %d left: %p %d right: %p %d\n\n", &nodeGen[ndx], ndx, nodeGen[ndx].left, nlx, nodeGen[ndx].right, nrx);
}
}
//comments
for (int o = 0; o < ((buckets > 63) ? 2 : 0); o++)
{
if (cnt < buckets)
{
odx = pow(2, loops - 6) - 1 + okick;
okick = okick + pow(2, loops - 5);
if (odx > buckets - 1 - oob)
{
while (odx > buckets - 1 - oob)
{
odx = odx - 1;
}
oob++;
}
nodeGen[odx].index = odx;
if (printFlag)
{
printf("index = %d odx = %d word = %s cnt = %d \n", nodeGen[odx].index, odx, nodeGen[odx].word, cnt) ;
}
cnt ++;
if (loops > 6)
{
nextBucket = nextBuckets(loops - 7, pkick, oob, buckets);
if (printFlag)
{
printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
}
nodeGen[odx].left = &nodeGen[nextBucket[0]];
nodeGen[odx].right = &nodeGen[nextBucket[1]];
if (odx > pow(2, loops) - 1 && oddlot)
{
used[odx] = odx;
if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
{
nodeGen[odx].left = NULL;
}
if (nextBucket[1] <= odx)
{
nodeGen[odx].right = NULL;
}
}
}
else
{
nodeGen[odx].left = NULL;
nodeGen[odx].right = NULL;
}
if (printFlag)
{
printf("node = %p %d left: %p %d right: %p %d\n\n", &nodeGen[odx], odx, nodeGen[odx].left, olx, nodeGen[odx].right, orx);
}
}
//comments
for (int p = 0; p < ((buckets > 127) ? 2 : 0); p++)
{
if (cnt < buckets)
{
pdx = pow(2, loops - 7) - 1 + pkick;
pkick = pkick + pow(2, loops - 6);
if (pdx > buckets - 1 - oob)
{
while (pdx > buckets - 1 - oob)
{
pdx = pdx - 1;
}
oob++;
}
nodeGen[pdx].index = pdx;
if (printFlag)
{
printf("index = %d pdx = %d word = %s cnt = %d \n", nodeGen[pdx].index, pdx, nodeGen[pdx].word, cnt) ;
}
cnt ++;
if (loops > 7)
{
nextBucket = nextBuckets(loops - 8, qkick, oob, buckets);
if (printFlag)
{
printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
}
nodeGen[pdx].left = &nodeGen[nextBucket[0]];
nodeGen[pdx].right = &nodeGen[nextBucket[1]];
if (pdx > pow(2, loops) - 1 && oddlot)
{
used[pdx] = pdx;
if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
{
nodeGen[pdx].left = NULL;
}
if (nextBucket[1] <= pdx)
{
nodeGen[pdx].right = NULL;
}
}
}
else
{
nodeGen[pdx].left = NULL;
nodeGen[pdx].right = NULL;
}
if (printFlag)
{
printf("node = %p %d left: %p %d right: %p %d\n\n", &nodeGen[pdx], pdx, nodeGen[pdx].left, plx, nodeGen[pdx].right, prx);
}
}
//comments
for (int q = 0; q < ((buckets > 255) ? 2 : 0); q++)
{
if (cnt < buckets)
{
qdx = pow(2, loops - 8) - 1 + qkick;
qkick = qkick + pow(2, loops - 7);
if (qdx > buckets - 1 - oob)
{
while (qdx > buckets - 1 - oob)
{
qdx = qdx - 1;
}
oob++;
}
nodeGen[qdx].index = qdx;
if (printFlag)
{
printf("index = %d qdx = %d word = %s cnt = %d \n", nodeGen[qdx].index, qdx, nodeGen[qdx].word, cnt) ;
}
cnt ++;
nodeGen[qdx].left = NULL;
nodeGen[qdx].right = NULL;
if (printFlag)
{
printf("node = %p %d left: %p %d right: %p %d\n\n", &nodeGen[qdx], qdx, nodeGen[qdx].left, qlx, nodeGen[qdx].right, qrx);
}
}
}
}
}
}
}
}
}
}
}
return (&nodeGen[idx]);
}
int *nextBuckets(int power, int kick, int oob, int buckets)
{
//comments
static int next[2];
for (int i = 0; i < 2; i++)
{
next[i] = pow(2, power) - 1 + kick;
kick = kick + pow(2, power + 1);
if (next[i] > buckets - 1 - oob)
{
while (next[i] > buckets - 1 - oob)
{
next[i] = next[i] - 1 ;
}
oob++;
}
}
return next;
}发布于 2020-10-02 22:34:53
一次读取一个文件一个字符是很慢的。即使标准输入被缓冲,您仍然有单独调用fgetc()的开销,每次都必须检查输入缓冲区中是否至少有一个字符,如果是这样,则从缓冲区中删除它并返回它,否则在缓冲区中读取更多的数据。
处理该文件的最快方法可能是mmap()它,尽管这不是可移植的C。相反,可以考虑使用fgets()读取整行,或者每次使用fread()读取几千字节。然后循环遍历刚才读取的字符行/数组。
请注意,如果您以整行方式阅读,并且您知道没有连字符,那么您始终知道您已经读取了所有单词,因此您不再需要将字符复制到word[]中,您只需要跟踪单词的开始和结束。
中不包括基准代码
请注意,对getrusage()的调用本身可能会很昂贵,所以不要在提交的内容中包括这一点。实际上,我根本不会调用getrusage(),而是使用time或Linux的perf这样的外部工具来测量程序使用的时间或CPU周期。
如果可能的话,
在构建字典时,首先分配内存在文件中读取,然后为一个单词数组分配内存,然后为散列桶分配内存,然后为每个桶为存储在其中的单词分配一些内存。基本上,记忆中的每个单词都有三个副本。这是在构建trie之前,trie每个字都有大量的分配。
这是对内存的巨大浪费,复制所有数据也不是免费的。如果字典比输入文本大,那也是一种巨大的浪费。因此,尽可能地删除多余的副本和分配。
我还看到您在malloc()中调用hash(),只是为了一个小的临时缓冲区。在本例中,只需使用一个简单的数组:
char lcWord[LENGTH + 1];的更快方法
问题规范提到,字典列出了已按字典排序的所有单词。除非要拼写检查的文本比字典有更多的单词,否则我甚至不会麻烦地对它进行散列或创建一棵树,而只是使用mmap(),并使用二进制搜索从字典中的输入中查找单词。但是让我们假设我们只需要使用标准的C函数,并且您确实希望构建一个trie。
首先,像您已经在内存中所做的那样加载字典。请注意,现在您已经将字典中的所有单词都存储在内存中,因此您不再需要复制它们,而只需在缓冲区中创建指针即可。您可以用NUL字节替换所有换行符,这样所有的单词都是有效的C字符串。
在构建trie时,可以使用字典已排序的事实。
我很难理解你是如何编制词典索引的。hash()函数看起来一点也不像散列函数,而是进行二进制搜索。散列函数的目的是避免任何类型的树遍历。在您的hash()返回之后,您将在check()中执行另一个二进制搜索。我想这会让你两败俱伤。
问题陈述清楚地指出,您应该实现“使用哈希表”的拼写检查器。因此,我将完全避免使用trie,只需专注于编写一个好的哈希表实现。如果您真的想使用trie,那么就不要使用hash(),只需将字典编码为一个trie即可。
https://codereview.stackexchange.com/questions/250120
复制相似问题