文章/答案/技术大牛

发布

社区首页 >问答首页 >拼写检查挑战-实现字典查找

问拼写检查挑战-实现字典查找
EN

Code Review用户

提问于 2020-10-02 17:02:02

回答 1查看 63关注 0票数 3

我正在努力加快这个节目的执行速度。唯一可以更改的文件是dictionary.c。

问题基本上是加载字典，然后对提交的文本进行拼写检查。我们使用的方法是假设字典已经被排序(确实如此)，然后在trie中创建一个哈希表，然后多次尝试加载该字典。所有这些都是为了使检查函数更快。不巧的是，虽然程序运行，但它的性能没有那么好。您可以在这里看到它相对于其他提交文件的表现：https://speller.cs50.net/cs50/problems/2020/x/challenges/speller，我被列在3004号

speller.c

// Implements a spell-checker

#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>

#include "dictionary.h"

// Undefine any definitions
#undef calculate
#undef getrusage

// Default dictionary
#define DICTIONARY "dictionaries/large"

// Prototype
double calculate(const struct rusage *b, const struct rusage *a);

int main(int argc, char *argv[])
{
    // Check for correct number of args
    if (argc != 2 && argc != 3)
    {
        printf("Usage: ./speller [DICTIONARY] text\n");
        return 1;
    }

    // Structures for timing data
    struct rusage before, after;

    // Benchmarks
    double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;

    // Determine dictionary to use
    char *dictionary = (argc == 3) ? argv[1] : DICTIONARY;

    // Load dictionary
    getrusage(RUSAGE_SELF, &before);
    bool loaded = load(dictionary);
    getrusage(RUSAGE_SELF, &after);

    // Exit if dictionary not loaded
    if (!loaded)
    {
        printf("Could not load %s.\n", dictionary);
        return 1;
    }

    // Calculate time to load dictionary
    time_load = calculate(&before, &after);

    // Try to open text
    char *text = (argc == 3) ? argv[2] : argv[1];
    FILE *file = fopen(text, "r");
    if (file == NULL)
    {
        printf("Could not open %s.\n", text);
        unload();
        return 1;
    }

    // Prepare to report misspellings
    printf("\nMISSPELLED WORDS\n\n");

    // Prepare to spell-check
    int index = 0, misspellings = 0, words = 0;
    char word[LENGTH + 1];

    // Spell-check each word in text
    for (int c = fgetc(file); c != EOF; c = fgetc(file))
    {
        // Allow only alphabetical characters and apostrophes
        if (isalpha(c) || (c == '\'' && index > 0))
        {
            // Append character to word
            word[index] = c;
            index++;

            // Ignore alphabetical strings too long to be words
            if (index > LENGTH)
            {
                // Consume remainder of alphabetical string
                while ((c = fgetc(file)) != EOF && isalpha(c));

                // Prepare for new word
                index = 0;
            }
        }

        // Ignore words with numbers (like MS Word can)
        else if (isdigit(c))
        {
            // Consume remainder of alphanumeric string
            while ((c = fgetc(file)) != EOF && isalnum(c));

            // Prepare for new word
            index = 0;
        }

        // We must have found a whole word
        else if (index > 0)
        {
            // Terminate current word
            word[index] = '\0';

            // Update counter
            words++;

            // Check word's spelling
            getrusage(RUSAGE_SELF, &before);

            bool misspelled = !check(word);

            getrusage(RUSAGE_SELF, &after);

            // Update benchmark
            time_check += calculate(&before, &after);

            // Print word if misspelled
            if (misspelled)
            {
                printf("%s\n", word);
                misspellings++;
            }

            // Prepare for next word
            index = 0;
        }
    }

    // Check whether there was an error
    if (ferror(file))
    {
        fclose(file);
        printf("Error reading %s.\n", text);
        unload();
        return 1;
    }

    // Close text
    fclose(file);

    // Determine dictionary's size
    getrusage(RUSAGE_SELF, &before);
    unsigned int n = size();
    getrusage(RUSAGE_SELF, &after);

    // Calculate time to determine dictionary's size
    time_size = calculate(&before, &after);

    // Unload dictionary
    getrusage(RUSAGE_SELF, &before);
    bool unloaded = unload();
    getrusage(RUSAGE_SELF, &after);

    // Abort if dictionary not unloaded
    if (!unloaded)
    {
        printf("Could not unload %s.\n", dictionary);
        return 1;
    }

    // Calculate time to unload dictionary
    time_unload = calculate(&before, &after);

    // Report benchmarks
    printf("\nWORDS MISSPELLED:     %d\n", misspellings);
    printf("WORDS IN DICTIONARY:  %d\n", n);
    printf("WORDS IN TEXT:        %d\n", words);
    printf("TIME IN load:         %.2f\n", time_load);
    printf("TIME IN check:        %.2f\n", time_check);
    printf("TIME IN size:         %.2f\n", time_size);
    printf("TIME IN unload:       %.2f\n", time_unload);
    printf("TIME IN TOTAL:        %.2f\n\n",
           time_load + time_check + time_size + time_unload);

    // Success
    return 0;
}

// Returns number of seconds between b and a
double calculate(const struct rusage *b, const struct rusage *a)
{
    if (b == NULL || a == NULL)
    {
        return 0.0;
    }
    else
    {
        return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
                  (b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
                 ((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
                  (b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
                / 1000000.0);
    }
}

dictionary.h

// Declares a dictionary's functionality

#ifndef DICTIONARY_H
#define DICTIONARY_H

#include <stdbool.h>

// Maximum length for a word
// (e.g., pneumonoultramicroscopicsilicovolcanoconiosis)
#define LENGTH 45

// Prototypes
bool check(const char *word);
unsigned int hash(const char *word);
bool load(const char *dictionary);
unsigned int size(void);
bool unload(void);

#endif // DICTIONARY_H

dictionary.c

//实现字典的功能

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include "dictionary.h"
#include <limits.h>
#include <math.h>
#include <ctype.h>

#define MAXWORDS 200000
#define MAXBUCKETS 511

unsigned int numberOfWords = 0;
int target;
unsigned int N = 1;  //number of buckets in hash table. (a function of how big the dictionary is)


// we modified nodeWords to containe more information
typedef struct nodeWords
{
    char  *word;
    char  *wordEnd;
    int    index;
    struct nodeWords *left;
    struct nodeWords *right;
} nodeWords;

nodeWords  *headHashNode;
nodeWords **nodeArray;
nodeWords *nodeHash;
nodeWords *nodeTri;
nodeWords **nodeHistory;

int hashBuckets;

int hashFlag = 0;

char (*words)[LENGTH + 1] = NULL;


long int findSize(const char *file_name);
unsigned int size(void);
unsigned int hash(const char *word);
bool check(const char *word);
bool unload(void);

nodeWords *leftRightAddress(int buckets, nodeWords *nodeGen);
int *nextBuckets(int power, int kick, int oob, int buckets);

//used for testing
typedef struct nodeNumbers
{
    int number;
    struct nodeNumbers *left;
    struct nodeNumbers *right;
} nodeNumbers;


// Returns true if word is in dictionary else false
bool check(const char *word)
{

    // we created tries for both the hash table and dictionary

    int lc = strlen(word);;
    char  *lcWord = malloc(46);


    for (int i = 0; i < lc; i++)
    {

        lcWord[i] = tolower(word[i]);
    }
    lcWord[lc] = '\0';


    int nodeIdx = hash(lcWord);
    if (nodeIdx < 0)
    {
        free(lcWord);
        return false;
    }

    nodeWords *searchNode;
    searchNode = nodeArray[nodeIdx];
    bool searchContinues = true, found = false;

    // this is our binary searech

    do
    {
        if (strcmp(lcWord, searchNode->word) == 0)
        {
            searchContinues = false;
            found = true;
        }
        else if (strcmp(lcWord, searchNode->word) < 0)
        {
            if (searchNode->left == NULL)
            {
                searchContinues = false;
            }
            else
            {
                searchNode = searchNode->left;
            }
        }
        else
        {
            if (searchNode->right == NULL)
            {
                searchContinues = false;
            }
            else
            {
                searchNode = searchNode->right;
            }
        }
    }
    while (searchContinues);

    free(lcWord);
    return found;
}
unsigned int hash(const char *word)
{

    // we assume the dictionary is already sorted

    nodeWords *node;
    node = headHashNode;
    int idx = -9;
    int lc = strlen(word);;
    char *lcWord = malloc(46);
    bool rightFlag = false;
    bool leftFag = false;

    for (int i = 0; i < lc; i++)
    {

        lcWord[i] = tolower(word[i]);
    }
    lcWord[lc] = '\0';

    // printf("HASH\n");
    // printf("%s %s %s\n",lcWord, words[0],words[numberOfWords - 1]);

    if (strcmp(lcWord, words[0]) < 0 || strcmp(lcWord, words[numberOfWords - 1]) > 0)
    {
        free(lcWord);
        return -1;
    }
    do
    {
        // printf("%s %s %s %d\n",lcWord, node->word,node->wordEnd,strcmp(lcWord,node->word));

        if (strcmp(lcWord, node->word) == 0)
        {
            //printf("FOUND 0\n");
            free(lcWord);
            return node->index;

        }
        else if (strcmp(lcWord, node->word) > 0)
        {
            if (strcmp(lcWord, node->wordEnd) <= 0 || node->right == NULL)
            {
                //printf("FOUND 1\n");
                free(lcWord);
                return node->index;
            }
            // printf("RIGHT\n");


            node = node->right;



        }
        else
        {
            // printf("LEFT\n");
            if (node->left == NULL)
            {
                return -1;
            }
            else
            {
                node = node->left;
            }
        }


    }
    while (true);
}




// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
    // messy but does the job

    // printf("enter load\n");
    //find number of bytes in dictionary file
    long int dictionarySize = findSize(dictionary);
    if (dictionarySize == -1)
    {
        //printf("File Not Found 0!\n");
        return false;
    }

    //printf("dictsize %ld\n",dictionarySize);

    //allocate memory to read in entire file
    char *data = malloc(dictionarySize + 1);

    //open dictionary file
    int filedesc = open(dictionary, O_RDONLY);
    if (filedesc < 0)
    {
        //printf("File Not Found 1!\n");
        return false;
    }
    //trying to gain load speed

    //read entire dictionary
    int bytesRead = read(filedesc, data, dictionarySize);
    close(filedesc);
    //printf("bytes %d\n",bytesRead);

    FILE *fp;
    fp = fopen("dict.txt", "w+");

    int row = 0;
    int col = 0;
    words = malloc(MAXWORDS * (LENGTH + 1));

    //parse dictionary string based on new lines and replace new line char with \0 (null)
    for (int i = 0; i < dictionarySize ; i++)
    {
        words[row][col] = data[i];
        col++;
        if (data[i] == 0x0a)
        {
            words[row][col - 1] = '\0';
            //printf("%d  %s\n", row , words[row]);
            col = 0;
            row++;
        }

    }

    //printf("col = %d %s\n",col,words[row-1] );

    if (col != 0)
    {
        words[row][col] = '\0';
        //printf("extra %d  %s\n", row , words[row]);
    }
    else
    {
        row--;
    }


    numberOfWords = row + 1;
    fclose(fp);
    free(data);

    //compute number of hash buckets
    target = (int) pow(2, ceil(log2(pow(numberOfWords, .5)))) - 1; //target = number of buckets per tri (except maybe the last one.)
    if (target == 0)
    {
        target = 1;

    }


    hashBuckets = ceil((float)numberOfWords /
                       target); //   hashBuckets = number of buckets for hash table - each bucket will point to a tri

    //printf("words %d target %d hashbuckets %d\n",numberOfWords,target,hashBuckets);

    if (hashBuckets > MAXBUCKETS)
    {
        printf("FATAL ERROR: Too many hash buckets\n");
        exit(-1);
    }
    //allocate space for hash buckets
    int cnt = 0;
    nodeHash = malloc(hashBuckets * sizeof(*nodeHash));

    for (int i =  0; i < numberOfWords ; i = i + target)
    {
        int endIdx = (i + target - 1 < numberOfWords) ? i + target - 1 : numberOfWords - 1;

        nodeHash[cnt].word = malloc((strlen(words[i]) + 1) * sizeof(char));
        strcpy(nodeHash[cnt].word, words[i]);

        nodeHash[cnt].wordEnd = malloc((strlen(words[endIdx]) + 1) * sizeof(char));
        strcpy(nodeHash[cnt].wordEnd, words[endIdx]);
        cnt++;
    }


    headHashNode = leftRightAddress(hashBuckets, nodeHash);


    int triCnt;
    cnt = 0;

    nodeArray = malloc(hashBuckets * sizeof(*nodeHash));

    nodeHistory = malloc(hashBuckets * sizeof(*nodeHash));

    for (int i = 0; i < numberOfWords; i = i + target)
    {
        nodeTri = malloc(target * sizeof(*nodeTri));
        nodeHistory[cnt] = nodeTri;


        triCnt = 0;
        for (int j = i, stopper = (i + target < numberOfWords) ? i + target : numberOfWords; j < stopper; j++)
        {
            nodeTri[triCnt].word = malloc((strlen(words[j]) + 1) * sizeof(char));
            strcpy(nodeTri[triCnt].word, words[j]);

            int endIdx = (j + target - 1 < numberOfWords) ? j + target - 1 : numberOfWords - 1;
            nodeTri[triCnt].wordEnd = malloc((strlen(words[endIdx]) + 1) * sizeof(char));
            strcpy(nodeTri[triCnt].wordEnd, words[endIdx]);

            triCnt++;
        }

        nodeArray[cnt] = leftRightAddress(triCnt, nodeTri);

        cnt++;


    }




    return true;
}

// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int size(void)
{
    return numberOfWords;
}

// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
    // quick and easy

    for (int i = 0; i < hashBuckets; i++)
    {
        free(nodeHash[i].word);
        free(nodeHash[i].wordEnd);
    }
    free(nodeHash);// TODO

    int cnt = 0;

    for (int i = 0; i < hashBuckets; i++)
    {
        nodeTri = nodeHistory[i];
        for (int j = 0; j < target; j++)
        {
            if (cnt < numberOfWords)
            {
                free(nodeTri[j].word);
                free(nodeTri[j].wordEnd);
            }
            cnt++;

        }
        free(nodeTri);
    }

    free(words);

    free(nodeArray);
    free(nodeHistory);

    return true;
}
long int findSize(const char *file_name)
{


    // opening the file in read mode
    FILE *fp = fopen(file_name, "r");

    // checking if the file exist or not
    if (fp == NULL)
    {
        //printf("File Not Found!\n");
        return -1;
    }

    fseek(fp, 0L, SEEK_END);

    // calculating the size of the file
    long int size = ftell(fp);

    // closing the file
    fclose(fp);

    return size;
}

nodeWords *leftRightAddress(int buckets, nodeWords *nodeGen)
{
    //did not use recursion for speed

    bool printFlag = false;

    int numbers[buckets] ;

    for (int i = 0; i < buckets; i++)
    {
        numbers[i] = i;
    }

    int loops = (int)(log2(buckets) + .000001),
        cnt   = 0,
        oob   = 0,
        rob   = 0;

    int idx, ilx = -100, irx = -100,
             jdx, jlx = -100, jrx = -100,
                  kdx, klx = -100, krx = -100,
                       ldx, llx = -100, lrx = -100,
                            mdx, mlx = -100, mrx = -100,
                                 ndx, nlx = -100, nrx = -100,
                                      odx, olx = -100, orx = -100,
                                           pdx, plx = -100, prx = -100,
                                                qdx, qlx = -100, qrx = -100;

    int ikick = 0, ijmp = 0,
        jkick = 0, jjmp = 0,
        kkick = 0, kjmp = 0,
        lkick = 0, ljmp = 0,
        mkick = 0, mjmp = 0,
        nkick = 0, njmp = 0,
        okick = 0, ojmp = 0,
        pkick = 0, pjmp = 0,
        qkick = 0, qjmp = 0;

    int *nextBucket;
    bool oddlot = false;
    int used[buckets];

    if (buckets != (int)pow(2, loops + 1) - 1)
    {
        for (int i = 0; i < buckets; i++)
        {
            used[i] = -99;
        }
        oddlot = true;
    }


    if (printFlag)
    {
        printf("loops= %d buckets=%d  \n", loops, buckets);
    }

    for (int i = 0; i < buckets; i++)
    {
        if (printFlag)
        {
            printf("%d   %s   %p  \n", i, nodeGen[i].word, &nodeGen[i]);
        }
    }
    //comments

    for (int i = 0; i < 1; i++)
    {
        if (cnt < buckets)
        {
            idx = pow(2, loops) - 1;
            nodeGen[idx].index = idx;

            if (printFlag)
            {
                printf("index = %d  idx = %d word = %s cnt = %d \n", nodeGen[idx].index, idx, nodeGen[idx].word, cnt);
            }
            cnt ++;

            if (loops > 0)
            {
                nextBucket = nextBuckets(loops - 1, jkick, oob,  buckets);
                if (printFlag)
                {
                    printf("NEXT %d %d %s\n", nextBucket[0], nextBucket[1], nodeGen[idx].word);
                }

                nodeGen[idx].left  = &nodeGen[nextBucket[0]];
                nodeGen[idx].right = &nodeGen[nextBucket[1]];

                //printf("VALUES %f %d idx=%d i=%d\n",pow(2,loops)-1,oddlot,idx,i);

                if (idx > pow(2, loops) - 1 && oddlot)
                {
                    used[idx] = idx;
                    if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                    {
                        nodeGen[idx].left = NULL;
                    }
                }
                if (nextBucket[1] <= idx)
                {
                    nodeGen[idx].right = NULL;
                }
            }
            else
            {
                nodeGen[idx].left  = NULL;
                nodeGen[idx].right = NULL;
            }
            if (printFlag)
            {
                printf("node idx = %p %d  left: %p  %d  right: %p  %d  oob: %d  rob:  %d\n\n", &nodeGen[idx], idx, nodeGen[idx].left, ilx,
                       nodeGen[idx].right, irx, oob, rob);
            }
        }
        //comments
        for (int j = 0; j < ((buckets > 1) ? 2 : 0); j++)
        {
            if (cnt < buckets)
            {
                jdx   = pow(2, loops - 1) - 1  + jkick;
                jkick = jkick + pow(2, loops);
                if (jdx > buckets - 1 - oob)
                {
                    while (jdx > buckets - 1 - oob)
                    {
                        jdx = jdx -  1 ;
                    }
                    oob++;
                }
                nodeGen[jdx].index = jdx;
                if (printFlag)
                {
                    printf("index = %d  jdx = %d word = %s cnt = %d \n", nodeGen[jdx].index, jdx, nodeGen[jdx].word, cnt) ;
                }
                cnt ++;

                if (loops > 1)
                {
                    nextBucket = nextBuckets(loops - 2, kkick, oob,  buckets);
                    if (printFlag)
                    {
                        printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                    }

                    nodeGen[jdx].left  = &nodeGen[nextBucket[0]];
                    nodeGen[jdx].right = &nodeGen[nextBucket[1]];

                    if (jdx > pow(2, loops) - 1  && oddlot)
                    {
                        used[jdx] = jdx;
                        if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                        {
                            nodeGen[jdx].left = NULL;
                        }
                        if (nextBucket[1] <= jdx)
                        {
                            nodeGen[jdx].right = NULL;
                        }
                    }
                }
                else
                {
                    nodeGen[jdx].left  = NULL;
                    nodeGen[jdx].right = NULL;
                }
                if (printFlag)
                {
                    printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[jdx], jdx, nodeGen[jdx].left, jlx, nodeGen[jdx].right, jrx);
                }
            }
            //comments
            for (int k = 0; k < ((buckets > 3) ? 2 : 0); k++)
            {
                if (cnt < buckets)
                {
                    kdx   = pow(2, loops - 2) - 1  + kkick;
                    kkick = kkick + pow(2, loops - 1);
                    if (kdx > buckets - 1 - oob)
                    {
                        while (kdx > buckets - 1 - oob)
                        {
                            kdx = kdx -  1 ;
                        }
                        oob++;
                    }
                    nodeGen[kdx].index = kdx;
                    if (printFlag)
                    {
                        printf("index = %d  kdx = %d word = %s cnt = %d \n", nodeGen[kdx].index, kdx, nodeGen[kdx].word, cnt) ;
                    }
                    cnt ++;

                    if (loops > 2)
                    {
                        nextBucket = nextBuckets(loops - 3, lkick, oob,  buckets);
                        if (printFlag)
                        {
                            printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                        }

                        nodeGen[kdx].left  = &nodeGen[nextBucket[0]];
                        nodeGen[kdx].right = &nodeGen[nextBucket[1]];

                        if (kdx > pow(2, loops) - 1  && oddlot)
                        {
                            used[kdx] = kdx;
                            if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                            {
                                nodeGen[kdx].left = NULL;
                            }
                            if (nextBucket[1] <= kdx)
                            {
                                nodeGen[kdx].right = NULL;
                            }
                        }
                    }
                    else
                    {
                        nodeGen[kdx].left  = NULL;
                        nodeGen[kdx].right = NULL;
                    }
                    if (printFlag)
                    {
                        printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[kdx], kdx, nodeGen[kdx].left, klx, nodeGen[kdx].right, krx);
                    }
                }
                //comments
                for (int l = 0; l < ((buckets > 7) ? 2 : 0); l++)
                {
                    if (cnt < buckets)
                    {
                        ldx   = pow(2, loops - 3) - 1  + lkick;
                        lkick = lkick + pow(2, loops - 2);
                        if (ldx > buckets - 1 - oob)
                        {
                            while (ldx > buckets - 1 - oob)
                            {
                                ldx = ldx - 1;
                            }
                            oob++;
                        }
                        nodeGen[ldx].index = ldx;
                        if (printFlag)
                        {
                            printf("index = %d  ldx = %d word = %s cnt = %d \n", nodeGen[ldx].index, ldx, nodeGen[ldx].word, cnt) ;
                        }
                        cnt ++;

                        if (loops > 3)
                        {
                            nextBucket = nextBuckets(loops - 4, mkick, oob,  buckets);
                            if (printFlag)
                            {
                                printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                            }

                            nodeGen[ldx].left  = &nodeGen[nextBucket[0]];
                            nodeGen[ldx].right = &nodeGen[nextBucket[1]];

                            if (ldx > pow(2, loops) - 1  && oddlot)
                            {
                                used[ldx] = ldx;
                                if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                {
                                    nodeGen[ldx].left = NULL;
                                }
                                if (nextBucket[1] <= ldx)
                                {
                                    nodeGen[ldx].right = NULL;
                                }
                            }
                        }
                        else
                        {
                            nodeGen[ldx].left  = NULL;
                            nodeGen[ldx].right = NULL;
                        }
                        if (printFlag)
                        {
                            printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[ldx], ldx, nodeGen[ldx].left, llx, nodeGen[ldx].right, lrx);
                        }
                    }
                    //comments
                    for (int m = 0; m < ((buckets > 15) ? 2 : 0); m++)
                    {
                        if (cnt < buckets)
                        {
                            mdx   = pow(2, loops - 4) - 1  + mkick;
                            mkick = mkick + pow(2, loops - 3);
                            if (mdx > buckets - 1 - oob)
                            {
                                while (mdx > buckets - 1 - oob)
                                {
                                    mdx = mdx - 1;
                                }
                                oob++;
                            }
                            nodeGen[mdx].index = mdx;
                            if (printFlag)
                            {
                                printf("index = %d  mdx = %d word = %s cnt = %d \n", nodeGen[mdx].index, mdx, nodeGen[mdx].word, cnt) ;
                            }
                            cnt ++;

                            if (loops > 4)
                            {
                                nextBucket = nextBuckets(loops - 5, nkick, oob,  buckets);
                                if (printFlag)
                                {
                                    printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                                }

                                nodeGen[mdx].left  = &nodeGen[nextBucket[0]];
                                nodeGen[mdx].right = &nodeGen[nextBucket[1]];

                                if (mdx > pow(2, loops) - 1  && oddlot)
                                {
                                    used[mdx] = mdx;
                                    if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                    {
                                        nodeGen[mdx].left = NULL;
                                    }
                                    if (nextBucket[1] <= mdx)
                                    {
                                        nodeGen[mdx].right = NULL;
                                    }
                                }
                            }
                            else
                            {
                                nodeGen[mdx].left  = NULL;
                                nodeGen[mdx].right = NULL;
                            }
                            if (printFlag)
                            {
                                printf("node = %p %d  left: %p  %d  right: %p  %d oob: %d  rob: %d\n\n", &nodeGen[mdx], mdx, nodeGen[mdx].left, mlx,
                                       nodeGen[mdx].right, mrx, oob, rob);
                            }
                        }
                        //comments
                        for (int n = 0; n < ((buckets > 31) ? 2 : 0); n++)
                        {
                            if (cnt < buckets)
                            {
                                ndx   = pow(2, loops - 5) - 1  + nkick;
                                nkick = nkick + pow(2, loops - 4);
                                if (ndx > buckets - 1 - oob)
                                {
                                    while (ndx > buckets - 1 - oob)
                                    {
                                        ndx = ndx - 1;
                                    }
                                    oob++;
                                }
                                nodeGen[ndx].index = ndx;
                                if (printFlag)
                                {
                                    printf("index = %d  ndx = %d word = %s cnt = %d \n", nodeGen[ndx].index, ndx, nodeGen[ndx].word, cnt) ;
                                }
                                cnt ++;

                                if (loops > 5)
                                {
                                    nextBucket = nextBuckets(loops - 6, okick, oob,  buckets);
                                    if (printFlag)
                                    {
                                        printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                                    }

                                    nodeGen[ndx].left  = &nodeGen[nextBucket[0]];
                                    nodeGen[ndx].right = &nodeGen[nextBucket[1]];

                                    if (ndx > pow(2, loops) - 1  && oddlot)
                                    {
                                        used[ndx] = ndx;
                                        if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                        {
                                            nodeGen[ndx].left = NULL;
                                        }
                                        if (nextBucket[1] <= ndx)
                                        {
                                            nodeGen[ndx].right = NULL;
                                        }
                                    }
                                }
                                else
                                {
                                    nodeGen[ndx].left  = NULL;
                                    nodeGen[ndx].right = NULL;
                                }
                                if (printFlag)
                                {
                                    printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[ndx], ndx, nodeGen[ndx].left, nlx, nodeGen[ndx].right, nrx);
                                }
                            }
                            //comments
                            for (int o = 0; o < ((buckets > 63) ? 2 : 0); o++)
                            {
                                if (cnt < buckets)
                                {
                                    odx   = pow(2, loops - 6) - 1  + okick;
                                    okick = okick + pow(2, loops - 5);
                                    if (odx > buckets - 1 - oob)
                                    {
                                        while (odx > buckets - 1 - oob)
                                        {
                                            odx = odx - 1;
                                        }
                                        oob++;
                                    }
                                    nodeGen[odx].index = odx;
                                    if (printFlag)
                                    {
                                        printf("index = %d  odx = %d word = %s cnt = %d \n", nodeGen[odx].index, odx, nodeGen[odx].word, cnt) ;
                                    }
                                    cnt ++;

                                    if (loops > 6)
                                    {
                                        nextBucket = nextBuckets(loops - 7, pkick, oob,  buckets);
                                        if (printFlag)
                                        {
                                            printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                                        }

                                        nodeGen[odx].left  = &nodeGen[nextBucket[0]];
                                        nodeGen[odx].right = &nodeGen[nextBucket[1]];

                                        if (odx > pow(2, loops) - 1  && oddlot)
                                        {
                                            used[odx] = odx;
                                            if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                            {
                                                nodeGen[odx].left = NULL;
                                            }
                                            if (nextBucket[1] <= odx)
                                            {
                                                nodeGen[odx].right = NULL;
                                            }
                                        }
                                    }
                                    else
                                    {
                                        nodeGen[odx].left  = NULL;
                                        nodeGen[odx].right = NULL;
                                    }
                                    if (printFlag)
                                    {
                                        printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[odx], odx, nodeGen[odx].left, olx, nodeGen[odx].right, orx);
                                    }
                                }
                                //comments
                                for (int p = 0; p < ((buckets > 127) ? 2 : 0); p++)
                                {
                                    if (cnt < buckets)
                                    {
                                        pdx   = pow(2, loops - 7) - 1  + pkick;
                                        pkick = pkick + pow(2, loops - 6);
                                        if (pdx > buckets - 1 - oob)
                                        {
                                            while (pdx > buckets - 1 - oob)
                                            {
                                                pdx = pdx - 1;
                                            }
                                            oob++;
                                        }
                                        nodeGen[pdx].index = pdx;
                                        if (printFlag)
                                        {
                                            printf("index = %d  pdx = %d word = %s cnt = %d \n", nodeGen[pdx].index, pdx, nodeGen[pdx].word, cnt) ;
                                        }
                                        cnt ++;

                                        if (loops > 7)
                                        {
                                            nextBucket = nextBuckets(loops - 8, qkick, oob,  buckets);
                                            if (printFlag)
                                            {
                                                printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                                            }

                                            nodeGen[pdx].left  = &nodeGen[nextBucket[0]];
                                            nodeGen[pdx].right = &nodeGen[nextBucket[1]];

                                            if (pdx > pow(2, loops) - 1  && oddlot)
                                            {
                                                used[pdx] = pdx;
                                                if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                                {
                                                    nodeGen[pdx].left = NULL;
                                                }
                                                if (nextBucket[1] <= pdx)
                                                {
                                                    nodeGen[pdx].right = NULL;
                                                }
                                            }
                                        }
                                        else
                                        {
                                            nodeGen[pdx].left  = NULL;
                                            nodeGen[pdx].right = NULL;
                                        }
                                        if (printFlag)
                                        {
                                            printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[pdx], pdx, nodeGen[pdx].left, plx, nodeGen[pdx].right, prx);
                                        }
                                    }
                                    //comments
                                    for (int q = 0; q < ((buckets > 255) ? 2 : 0); q++)
                                    {
                                        if (cnt < buckets)
                                        {
                                            qdx   = pow(2, loops - 8) - 1  +  qkick;
                                            qkick = qkick + pow(2, loops - 7);
                                            if (qdx > buckets - 1 - oob)
                                            {
                                                while (qdx > buckets - 1 - oob)
                                                {
                                                    qdx = qdx - 1;
                                                }
                                                oob++;
                                            }
                                            nodeGen[qdx].index = qdx;
                                            if (printFlag)
                                            {
                                                printf("index = %d  qdx = %d word = %s cnt = %d \n", nodeGen[qdx].index, qdx, nodeGen[qdx].word, cnt) ;
                                            }
                                            cnt ++;

                                            nodeGen[qdx].left  = NULL;
                                            nodeGen[qdx].right = NULL;

                                            if (printFlag)
                                            {
                                                printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[qdx], qdx, nodeGen[qdx].left, qlx, nodeGen[qdx].right, qrx);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    return (&nodeGen[idx]);
}

int *nextBuckets(int power, int kick, int oob, int buckets)
{
    //comments
    static int  next[2];

    for (int i = 0; i < 2; i++)
    {
        next[i]   = pow(2, power) - 1  + kick;
        kick = kick + pow(2, power + 1);

        if (next[i] > buckets - 1 - oob)
        {
            while (next[i] > buckets - 1 - oob)
            {
                next[i] = next[i] -  1 ;
            }
            oob++;
        }
    }
    return next;
}

performance

回答 1

Code Review用户

回答已采纳

发布于 2020-10-02 22:34:53

不按字符读取文件字符

一次读取一个文件一个字符是很慢的。即使标准输入被缓冲，您仍然有单独调用fgetc()的开销，每次都必须检查输入缓冲区中是否至少有一个字符，如果是这样，则从缓冲区中删除它并返回它，否则在缓冲区中读取更多的数据。

处理该文件的最快方法可能是mmap()它，尽管这不是可移植的C。相反，可以考虑使用fgets()读取整行，或者每次使用fread()读取几千字节。然后循环遍历刚才读取的字符行/数组。

请注意，如果您以整行方式阅读，并且您知道没有连字符，那么您始终知道您已经读取了所有单词，因此您不再需要将字符复制到word[]中，您只需要跟踪单词的开始和结束。

在提交

中不包括基准代码

请注意，对getrusage()的调用本身可能会很昂贵，所以不要在提交的内容中包括这一点。实际上，我根本不会调用getrusage()，而是使用time或Linux的perf这样的外部工具来测量程序使用的时间或CPU周期。

如果可能的话，

避免分配内存，

在构建字典时，首先分配内存在文件中读取，然后为一个单词数组分配内存，然后为散列桶分配内存，然后为每个桶为存储在其中的单词分配一些内存。基本上，记忆中的每个单词都有三个副本。这是在构建trie之前，trie每个字都有大量的分配。

这是对内存的巨大浪费，复制所有数据也不是免费的。如果字典比输入文本大，那也是一种巨大的浪费。因此，尽可能地删除多余的副本和分配。

我还看到您在malloc()中调用hash()，只是为了一个小的临时缓冲区。在本例中，只需使用一个简单的数组：

char lcWord[LENGTH + 1];

加载字典

的更快方法

问题规范提到，字典列出了已按字典排序的所有单词。除非要拼写检查的文本比字典有更多的单词，否则我甚至不会麻烦地对它进行散列或创建一棵树，而只是使用mmap()，并使用二进制搜索从字典中的输入中查找单词。但是让我们假设我们只需要使用标准的C函数，并且您确实希望构建一个trie。

首先，像您已经在内存中所做的那样加载字典。请注意，现在您已经将字典中的所有单词都存储在内存中，因此您不再需要复制它们，而只需在缓冲区中创建指针即可。您可以用NUL字节替换所有换行符，这样所有的单词都是有效的C字符串。

在构建trie时，可以使用字典已排序的事实。

Hash？崔？两者都是？

我很难理解你是如何编制词典索引的。hash()函数看起来一点也不像散列函数，而是进行二进制搜索。散列函数的目的是避免任何类型的树遍历。在您的hash()返回之后，您将在check()中执行另一个二进制搜索。我想这会让你两败俱伤。

问题陈述清楚地指出，您应该实现“使用哈希表”的拼写检查器。因此，我将完全避免使用trie，只需专注于编写一个好的哈希表实现。如果您真的想使用trie，那么就不要使用hash()，只需将字典编码为一个trie即可。

票数 3

页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://codereview.stackexchange.com/questions/250120

复制

相似问题

问拼写检查挑战-实现字典查找
EN

回答 1

Code Review用户

不按字符读取文件字符

在提交

避免分配内存，

加载字典

Hash？崔？两者都是？

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问拼写检查挑战-实现字典查找EN

回答 1

Code Review用户

不按字符读取文件字符

在提交

避免分配内存，

加载字典

Hash？崔？两者都是？

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问拼写检查挑战-实现字典查找
EN