首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >拼写检查挑战-实现字典查找

拼写检查挑战-实现字典查找
EN

Code Review用户
提问于 2020-10-02 17:02:02
回答 1查看 63关注 0票数 3

我正在努力加快这个节目的执行速度。唯一可以更改的文件是dictionary.c。

问题基本上是加载字典,然后对提交的文本进行拼写检查。我们使用的方法是假设字典已经被排序(确实如此),然后在trie中创建一个哈希表,然后多次尝试加载该字典。所有这些都是为了使检查函数更快。不巧的是,虽然程序运行,但它的性能没有那么好。您可以在这里看到它相对于其他提交文件的表现:https://speller.cs50.net/cs50/problems/2020/x/challenges/speller,我被列在3004号

speller.c

代码语言:javascript
复制
// Implements a spell-checker

#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>

#include "dictionary.h"

// Undefine any definitions
#undef calculate
#undef getrusage

// Default dictionary
#define DICTIONARY "dictionaries/large"

// Prototype
double calculate(const struct rusage *b, const struct rusage *a);

int main(int argc, char *argv[])
{
    // Check for correct number of args
    if (argc != 2 && argc != 3)
    {
        printf("Usage: ./speller [DICTIONARY] text\n");
        return 1;
    }

    // Structures for timing data
    struct rusage before, after;

    // Benchmarks
    double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;

    // Determine dictionary to use
    char *dictionary = (argc == 3) ? argv[1] : DICTIONARY;

    // Load dictionary
    getrusage(RUSAGE_SELF, &before);
    bool loaded = load(dictionary);
    getrusage(RUSAGE_SELF, &after);

    // Exit if dictionary not loaded
    if (!loaded)
    {
        printf("Could not load %s.\n", dictionary);
        return 1;
    }

    // Calculate time to load dictionary
    time_load = calculate(&before, &after);

    // Try to open text
    char *text = (argc == 3) ? argv[2] : argv[1];
    FILE *file = fopen(text, "r");
    if (file == NULL)
    {
        printf("Could not open %s.\n", text);
        unload();
        return 1;
    }

    // Prepare to report misspellings
    printf("\nMISSPELLED WORDS\n\n");

    // Prepare to spell-check
    int index = 0, misspellings = 0, words = 0;
    char word[LENGTH + 1];

    // Spell-check each word in text
    for (int c = fgetc(file); c != EOF; c = fgetc(file))
    {
        // Allow only alphabetical characters and apostrophes
        if (isalpha(c) || (c == '\'' && index > 0))
        {
            // Append character to word
            word[index] = c;
            index++;

            // Ignore alphabetical strings too long to be words
            if (index > LENGTH)
            {
                // Consume remainder of alphabetical string
                while ((c = fgetc(file)) != EOF && isalpha(c));

                // Prepare for new word
                index = 0;
            }
        }

        // Ignore words with numbers (like MS Word can)
        else if (isdigit(c))
        {
            // Consume remainder of alphanumeric string
            while ((c = fgetc(file)) != EOF && isalnum(c));

            // Prepare for new word
            index = 0;
        }

        // We must have found a whole word
        else if (index > 0)
        {
            // Terminate current word
            word[index] = '\0';

            // Update counter
            words++;

            // Check word's spelling
            getrusage(RUSAGE_SELF, &before);

            bool misspelled = !check(word);

            getrusage(RUSAGE_SELF, &after);

            // Update benchmark
            time_check += calculate(&before, &after);

            // Print word if misspelled
            if (misspelled)
            {
                printf("%s\n", word);
                misspellings++;
            }

            // Prepare for next word
            index = 0;
        }
    }

    // Check whether there was an error
    if (ferror(file))
    {
        fclose(file);
        printf("Error reading %s.\n", text);
        unload();
        return 1;
    }

    // Close text
    fclose(file);

    // Determine dictionary's size
    getrusage(RUSAGE_SELF, &before);
    unsigned int n = size();
    getrusage(RUSAGE_SELF, &after);

    // Calculate time to determine dictionary's size
    time_size = calculate(&before, &after);

    // Unload dictionary
    getrusage(RUSAGE_SELF, &before);
    bool unloaded = unload();
    getrusage(RUSAGE_SELF, &after);

    // Abort if dictionary not unloaded
    if (!unloaded)
    {
        printf("Could not unload %s.\n", dictionary);
        return 1;
    }

    // Calculate time to unload dictionary
    time_unload = calculate(&before, &after);

    // Report benchmarks
    printf("\nWORDS MISSPELLED:     %d\n", misspellings);
    printf("WORDS IN DICTIONARY:  %d\n", n);
    printf("WORDS IN TEXT:        %d\n", words);
    printf("TIME IN load:         %.2f\n", time_load);
    printf("TIME IN check:        %.2f\n", time_check);
    printf("TIME IN size:         %.2f\n", time_size);
    printf("TIME IN unload:       %.2f\n", time_unload);
    printf("TIME IN TOTAL:        %.2f\n\n",
           time_load + time_check + time_size + time_unload);

    // Success
    return 0;
}

// Returns number of seconds between b and a
double calculate(const struct rusage *b, const struct rusage *a)
{
    if (b == NULL || a == NULL)
    {
        return 0.0;
    }
    else
    {
        return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
                  (b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
                 ((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
                  (b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
                / 1000000.0);
    }
}

dictionary.h

代码语言:javascript
复制
// Declares a dictionary's functionality

#ifndef DICTIONARY_H
#define DICTIONARY_H

#include <stdbool.h>

// Maximum length for a word
// (e.g., pneumonoultramicroscopicsilicovolcanoconiosis)
#define LENGTH 45

// Prototypes
bool check(const char *word);
unsigned int hash(const char *word);
bool load(const char *dictionary);
unsigned int size(void);
bool unload(void);

#endif // DICTIONARY_H

dictionary.c

//实现字典的功能

代码语言:javascript
复制
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include "dictionary.h"
#include <limits.h>
#include <math.h>
#include <ctype.h>

#define MAXWORDS 200000
#define MAXBUCKETS 511

unsigned int numberOfWords = 0;
int target;
unsigned int N = 1;  //number of buckets in hash table. (a function of how big the dictionary is)


// we modified nodeWords to containe more information
typedef struct nodeWords
{
    char  *word;
    char  *wordEnd;
    int    index;
    struct nodeWords *left;
    struct nodeWords *right;
} nodeWords;

nodeWords  *headHashNode;
nodeWords **nodeArray;
nodeWords *nodeHash;
nodeWords *nodeTri;
nodeWords **nodeHistory;

int hashBuckets;

int hashFlag = 0;

char (*words)[LENGTH + 1] = NULL;


long int findSize(const char *file_name);
unsigned int size(void);
unsigned int hash(const char *word);
bool check(const char *word);
bool unload(void);

nodeWords *leftRightAddress(int buckets, nodeWords *nodeGen);
int *nextBuckets(int power, int kick, int oob, int buckets);

//used for testing
typedef struct nodeNumbers
{
    int number;
    struct nodeNumbers *left;
    struct nodeNumbers *right;
} nodeNumbers;


// Returns true if word is in dictionary else false
bool check(const char *word)
{

    // we created tries for both the hash table and dictionary

    int lc = strlen(word);;
    char  *lcWord = malloc(46);


    for (int i = 0; i < lc; i++)
    {

        lcWord[i] = tolower(word[i]);
    }
    lcWord[lc] = '\0';


    int nodeIdx = hash(lcWord);
    if (nodeIdx < 0)
    {
        free(lcWord);
        return false;
    }

    nodeWords *searchNode;
    searchNode = nodeArray[nodeIdx];
    bool searchContinues = true, found = false;

    // this is our binary searech

    do
    {
        if (strcmp(lcWord, searchNode->word) == 0)
        {
            searchContinues = false;
            found = true;
        }
        else if (strcmp(lcWord, searchNode->word) < 0)
        {
            if (searchNode->left == NULL)
            {
                searchContinues = false;
            }
            else
            {
                searchNode = searchNode->left;
            }
        }
        else
        {
            if (searchNode->right == NULL)
            {
                searchContinues = false;
            }
            else
            {
                searchNode = searchNode->right;
            }
        }
    }
    while (searchContinues);

    free(lcWord);
    return found;
}
unsigned int hash(const char *word)
{

    // we assume the dictionary is already sorted

    nodeWords *node;
    node = headHashNode;
    int idx = -9;
    int lc = strlen(word);;
    char *lcWord = malloc(46);
    bool rightFlag = false;
    bool leftFag = false;

    for (int i = 0; i < lc; i++)
    {

        lcWord[i] = tolower(word[i]);
    }
    lcWord[lc] = '\0';

    // printf("HASH\n");
    // printf("%s %s %s\n",lcWord, words[0],words[numberOfWords - 1]);

    if (strcmp(lcWord, words[0]) < 0 || strcmp(lcWord, words[numberOfWords - 1]) > 0)
    {
        free(lcWord);
        return -1;
    }
    do
    {
        // printf("%s %s %s %d\n",lcWord, node->word,node->wordEnd,strcmp(lcWord,node->word));

        if (strcmp(lcWord, node->word) == 0)
        {
            //printf("FOUND 0\n");
            free(lcWord);
            return node->index;

        }
        else if (strcmp(lcWord, node->word) > 0)
        {
            if (strcmp(lcWord, node->wordEnd) <= 0 || node->right == NULL)
            {
                //printf("FOUND 1\n");
                free(lcWord);
                return node->index;
            }
            // printf("RIGHT\n");


            node = node->right;



        }
        else
        {
            // printf("LEFT\n");
            if (node->left == NULL)
            {
                return -1;
            }
            else
            {
                node = node->left;
            }
        }


    }
    while (true);
}




// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
    // messy but does the job

    // printf("enter load\n");
    //find number of bytes in dictionary file
    long int dictionarySize = findSize(dictionary);
    if (dictionarySize == -1)
    {
        //printf("File Not Found 0!\n");
        return false;
    }

    //printf("dictsize %ld\n",dictionarySize);

    //allocate memory to read in entire file
    char *data = malloc(dictionarySize + 1);

    //open dictionary file
    int filedesc = open(dictionary, O_RDONLY);
    if (filedesc < 0)
    {
        //printf("File Not Found 1!\n");
        return false;
    }
    //trying to gain load speed

    //read entire dictionary
    int bytesRead = read(filedesc, data, dictionarySize);
    close(filedesc);
    //printf("bytes %d\n",bytesRead);

    FILE *fp;
    fp = fopen("dict.txt", "w+");

    int row = 0;
    int col = 0;
    words = malloc(MAXWORDS * (LENGTH + 1));

    //parse dictionary string based on new lines and replace new line char with \0 (null)
    for (int i = 0; i < dictionarySize ; i++)
    {
        words[row][col] = data[i];
        col++;
        if (data[i] == 0x0a)
        {
            words[row][col - 1] = '\0';
            //printf("%d  %s\n", row , words[row]);
            col = 0;
            row++;
        }

    }

    //printf("col = %d %s\n",col,words[row-1] );

    if (col != 0)
    {
        words[row][col] = '\0';
        //printf("extra %d  %s\n", row , words[row]);
    }
    else
    {
        row--;
    }


    numberOfWords = row + 1;
    fclose(fp);
    free(data);

    //compute number of hash buckets
    target = (int) pow(2, ceil(log2(pow(numberOfWords, .5)))) - 1; //target = number of buckets per tri (except maybe the last one.)
    if (target == 0)
    {
        target = 1;

    }


    hashBuckets = ceil((float)numberOfWords /
                       target); //   hashBuckets = number of buckets for hash table - each bucket will point to a tri

    //printf("words %d target %d hashbuckets %d\n",numberOfWords,target,hashBuckets);

    if (hashBuckets > MAXBUCKETS)
    {
        printf("FATAL ERROR: Too many hash buckets\n");
        exit(-1);
    }
    //allocate space for hash buckets
    int cnt = 0;
    nodeHash = malloc(hashBuckets * sizeof(*nodeHash));

    for (int i =  0; i < numberOfWords ; i = i + target)
    {
        int endIdx = (i + target - 1 < numberOfWords) ? i + target - 1 : numberOfWords - 1;

        nodeHash[cnt].word = malloc((strlen(words[i]) + 1) * sizeof(char));
        strcpy(nodeHash[cnt].word, words[i]);

        nodeHash[cnt].wordEnd = malloc((strlen(words[endIdx]) + 1) * sizeof(char));
        strcpy(nodeHash[cnt].wordEnd, words[endIdx]);
        cnt++;
    }


    headHashNode = leftRightAddress(hashBuckets, nodeHash);


    int triCnt;
    cnt = 0;

    nodeArray = malloc(hashBuckets * sizeof(*nodeHash));

    nodeHistory = malloc(hashBuckets * sizeof(*nodeHash));

    for (int i = 0; i < numberOfWords; i = i + target)
    {
        nodeTri = malloc(target * sizeof(*nodeTri));
        nodeHistory[cnt] = nodeTri;


        triCnt = 0;
        for (int j = i, stopper = (i + target < numberOfWords) ? i + target : numberOfWords; j < stopper; j++)
        {
            nodeTri[triCnt].word = malloc((strlen(words[j]) + 1) * sizeof(char));
            strcpy(nodeTri[triCnt].word, words[j]);

            int endIdx = (j + target - 1 < numberOfWords) ? j + target - 1 : numberOfWords - 1;
            nodeTri[triCnt].wordEnd = malloc((strlen(words[endIdx]) + 1) * sizeof(char));
            strcpy(nodeTri[triCnt].wordEnd, words[endIdx]);

            triCnt++;
        }

        nodeArray[cnt] = leftRightAddress(triCnt, nodeTri);

        cnt++;


    }




    return true;
}

// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int size(void)
{
    return numberOfWords;
}

// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
    // quick and easy

    for (int i = 0; i < hashBuckets; i++)
    {
        free(nodeHash[i].word);
        free(nodeHash[i].wordEnd);
    }
    free(nodeHash);// TODO

    int cnt = 0;

    for (int i = 0; i < hashBuckets; i++)
    {
        nodeTri = nodeHistory[i];
        for (int j = 0; j < target; j++)
        {
            if (cnt < numberOfWords)
            {
                free(nodeTri[j].word);
                free(nodeTri[j].wordEnd);
            }
            cnt++;

        }
        free(nodeTri);
    }

    free(words);

    free(nodeArray);
    free(nodeHistory);

    return true;
}
long int findSize(const char *file_name)
{


    // opening the file in read mode
    FILE *fp = fopen(file_name, "r");

    // checking if the file exist or not
    if (fp == NULL)
    {
        //printf("File Not Found!\n");
        return -1;
    }

    fseek(fp, 0L, SEEK_END);

    // calculating the size of the file
    long int size = ftell(fp);

    // closing the file
    fclose(fp);

    return size;
}

nodeWords *leftRightAddress(int buckets, nodeWords *nodeGen)
{
    //did not use recursion for speed

    bool printFlag = false;

    int numbers[buckets] ;

    for (int i = 0; i < buckets; i++)
    {
        numbers[i] = i;
    }

    int loops = (int)(log2(buckets) + .000001),
        cnt   = 0,
        oob   = 0,
        rob   = 0;

    int idx, ilx = -100, irx = -100,
             jdx, jlx = -100, jrx = -100,
                  kdx, klx = -100, krx = -100,
                       ldx, llx = -100, lrx = -100,
                            mdx, mlx = -100, mrx = -100,
                                 ndx, nlx = -100, nrx = -100,
                                      odx, olx = -100, orx = -100,
                                           pdx, plx = -100, prx = -100,
                                                qdx, qlx = -100, qrx = -100;

    int ikick = 0, ijmp = 0,
        jkick = 0, jjmp = 0,
        kkick = 0, kjmp = 0,
        lkick = 0, ljmp = 0,
        mkick = 0, mjmp = 0,
        nkick = 0, njmp = 0,
        okick = 0, ojmp = 0,
        pkick = 0, pjmp = 0,
        qkick = 0, qjmp = 0;

    int *nextBucket;
    bool oddlot = false;
    int used[buckets];

    if (buckets != (int)pow(2, loops + 1) - 1)
    {
        for (int i = 0; i < buckets; i++)
        {
            used[i] = -99;
        }
        oddlot = true;
    }


    if (printFlag)
    {
        printf("loops= %d buckets=%d  \n", loops, buckets);
    }

    for (int i = 0; i < buckets; i++)
    {
        if (printFlag)
        {
            printf("%d   %s   %p  \n", i, nodeGen[i].word, &nodeGen[i]);
        }
    }
    //comments

    for (int i = 0; i < 1; i++)
    {
        if (cnt < buckets)
        {
            idx = pow(2, loops) - 1;
            nodeGen[idx].index = idx;

            if (printFlag)
            {
                printf("index = %d  idx = %d word = %s cnt = %d \n", nodeGen[idx].index, idx, nodeGen[idx].word, cnt);
            }
            cnt ++;

            if (loops > 0)
            {
                nextBucket = nextBuckets(loops - 1, jkick, oob,  buckets);
                if (printFlag)
                {
                    printf("NEXT %d %d %s\n", nextBucket[0], nextBucket[1], nodeGen[idx].word);
                }

                nodeGen[idx].left  = &nodeGen[nextBucket[0]];
                nodeGen[idx].right = &nodeGen[nextBucket[1]];

                //printf("VALUES %f %d idx=%d i=%d\n",pow(2,loops)-1,oddlot,idx,i);

                if (idx > pow(2, loops) - 1 && oddlot)
                {
                    used[idx] = idx;
                    if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                    {
                        nodeGen[idx].left = NULL;
                    }
                }
                if (nextBucket[1] <= idx)
                {
                    nodeGen[idx].right = NULL;
                }
            }
            else
            {
                nodeGen[idx].left  = NULL;
                nodeGen[idx].right = NULL;
            }
            if (printFlag)
            {
                printf("node idx = %p %d  left: %p  %d  right: %p  %d  oob: %d  rob:  %d\n\n", &nodeGen[idx], idx, nodeGen[idx].left, ilx,
                       nodeGen[idx].right, irx, oob, rob);
            }
        }
        //comments
        for (int j = 0; j < ((buckets > 1) ? 2 : 0); j++)
        {
            if (cnt < buckets)
            {
                jdx   = pow(2, loops - 1) - 1  + jkick;
                jkick = jkick + pow(2, loops);
                if (jdx > buckets - 1 - oob)
                {
                    while (jdx > buckets - 1 - oob)
                    {
                        jdx = jdx -  1 ;
                    }
                    oob++;
                }
                nodeGen[jdx].index = jdx;
                if (printFlag)
                {
                    printf("index = %d  jdx = %d word = %s cnt = %d \n", nodeGen[jdx].index, jdx, nodeGen[jdx].word, cnt) ;
                }
                cnt ++;

                if (loops > 1)
                {
                    nextBucket = nextBuckets(loops - 2, kkick, oob,  buckets);
                    if (printFlag)
                    {
                        printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                    }

                    nodeGen[jdx].left  = &nodeGen[nextBucket[0]];
                    nodeGen[jdx].right = &nodeGen[nextBucket[1]];

                    if (jdx > pow(2, loops) - 1  && oddlot)
                    {
                        used[jdx] = jdx;
                        if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                        {
                            nodeGen[jdx].left = NULL;
                        }
                        if (nextBucket[1] <= jdx)
                        {
                            nodeGen[jdx].right = NULL;
                        }
                    }
                }
                else
                {
                    nodeGen[jdx].left  = NULL;
                    nodeGen[jdx].right = NULL;
                }
                if (printFlag)
                {
                    printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[jdx], jdx, nodeGen[jdx].left, jlx, nodeGen[jdx].right, jrx);
                }
            }
            //comments
            for (int k = 0; k < ((buckets > 3) ? 2 : 0); k++)
            {
                if (cnt < buckets)
                {
                    kdx   = pow(2, loops - 2) - 1  + kkick;
                    kkick = kkick + pow(2, loops - 1);
                    if (kdx > buckets - 1 - oob)
                    {
                        while (kdx > buckets - 1 - oob)
                        {
                            kdx = kdx -  1 ;
                        }
                        oob++;
                    }
                    nodeGen[kdx].index = kdx;
                    if (printFlag)
                    {
                        printf("index = %d  kdx = %d word = %s cnt = %d \n", nodeGen[kdx].index, kdx, nodeGen[kdx].word, cnt) ;
                    }
                    cnt ++;

                    if (loops > 2)
                    {
                        nextBucket = nextBuckets(loops - 3, lkick, oob,  buckets);
                        if (printFlag)
                        {
                            printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                        }

                        nodeGen[kdx].left  = &nodeGen[nextBucket[0]];
                        nodeGen[kdx].right = &nodeGen[nextBucket[1]];

                        if (kdx > pow(2, loops) - 1  && oddlot)
                        {
                            used[kdx] = kdx;
                            if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                            {
                                nodeGen[kdx].left = NULL;
                            }
                            if (nextBucket[1] <= kdx)
                            {
                                nodeGen[kdx].right = NULL;
                            }
                        }
                    }
                    else
                    {
                        nodeGen[kdx].left  = NULL;
                        nodeGen[kdx].right = NULL;
                    }
                    if (printFlag)
                    {
                        printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[kdx], kdx, nodeGen[kdx].left, klx, nodeGen[kdx].right, krx);
                    }
                }
                //comments
                for (int l = 0; l < ((buckets > 7) ? 2 : 0); l++)
                {
                    if (cnt < buckets)
                    {
                        ldx   = pow(2, loops - 3) - 1  + lkick;
                        lkick = lkick + pow(2, loops - 2);
                        if (ldx > buckets - 1 - oob)
                        {
                            while (ldx > buckets - 1 - oob)
                            {
                                ldx = ldx - 1;
                            }
                            oob++;
                        }
                        nodeGen[ldx].index = ldx;
                        if (printFlag)
                        {
                            printf("index = %d  ldx = %d word = %s cnt = %d \n", nodeGen[ldx].index, ldx, nodeGen[ldx].word, cnt) ;
                        }
                        cnt ++;

                        if (loops > 3)
                        {
                            nextBucket = nextBuckets(loops - 4, mkick, oob,  buckets);
                            if (printFlag)
                            {
                                printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                            }

                            nodeGen[ldx].left  = &nodeGen[nextBucket[0]];
                            nodeGen[ldx].right = &nodeGen[nextBucket[1]];

                            if (ldx > pow(2, loops) - 1  && oddlot)
                            {
                                used[ldx] = ldx;
                                if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                {
                                    nodeGen[ldx].left = NULL;
                                }
                                if (nextBucket[1] <= ldx)
                                {
                                    nodeGen[ldx].right = NULL;
                                }
                            }
                        }
                        else
                        {
                            nodeGen[ldx].left  = NULL;
                            nodeGen[ldx].right = NULL;
                        }
                        if (printFlag)
                        {
                            printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[ldx], ldx, nodeGen[ldx].left, llx, nodeGen[ldx].right, lrx);
                        }
                    }
                    //comments
                    for (int m = 0; m < ((buckets > 15) ? 2 : 0); m++)
                    {
                        if (cnt < buckets)
                        {
                            mdx   = pow(2, loops - 4) - 1  + mkick;
                            mkick = mkick + pow(2, loops - 3);
                            if (mdx > buckets - 1 - oob)
                            {
                                while (mdx > buckets - 1 - oob)
                                {
                                    mdx = mdx - 1;
                                }
                                oob++;
                            }
                            nodeGen[mdx].index = mdx;
                            if (printFlag)
                            {
                                printf("index = %d  mdx = %d word = %s cnt = %d \n", nodeGen[mdx].index, mdx, nodeGen[mdx].word, cnt) ;
                            }
                            cnt ++;

                            if (loops > 4)
                            {
                                nextBucket = nextBuckets(loops - 5, nkick, oob,  buckets);
                                if (printFlag)
                                {
                                    printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                                }

                                nodeGen[mdx].left  = &nodeGen[nextBucket[0]];
                                nodeGen[mdx].right = &nodeGen[nextBucket[1]];

                                if (mdx > pow(2, loops) - 1  && oddlot)
                                {
                                    used[mdx] = mdx;
                                    if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                    {
                                        nodeGen[mdx].left = NULL;
                                    }
                                    if (nextBucket[1] <= mdx)
                                    {
                                        nodeGen[mdx].right = NULL;
                                    }
                                }
                            }
                            else
                            {
                                nodeGen[mdx].left  = NULL;
                                nodeGen[mdx].right = NULL;
                            }
                            if (printFlag)
                            {
                                printf("node = %p %d  left: %p  %d  right: %p  %d oob: %d  rob: %d\n\n", &nodeGen[mdx], mdx, nodeGen[mdx].left, mlx,
                                       nodeGen[mdx].right, mrx, oob, rob);
                            }
                        }
                        //comments
                        for (int n = 0; n < ((buckets > 31) ? 2 : 0); n++)
                        {
                            if (cnt < buckets)
                            {
                                ndx   = pow(2, loops - 5) - 1  + nkick;
                                nkick = nkick + pow(2, loops - 4);
                                if (ndx > buckets - 1 - oob)
                                {
                                    while (ndx > buckets - 1 - oob)
                                    {
                                        ndx = ndx - 1;
                                    }
                                    oob++;
                                }
                                nodeGen[ndx].index = ndx;
                                if (printFlag)
                                {
                                    printf("index = %d  ndx = %d word = %s cnt = %d \n", nodeGen[ndx].index, ndx, nodeGen[ndx].word, cnt) ;
                                }
                                cnt ++;

                                if (loops > 5)
                                {
                                    nextBucket = nextBuckets(loops - 6, okick, oob,  buckets);
                                    if (printFlag)
                                    {
                                        printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                                    }

                                    nodeGen[ndx].left  = &nodeGen[nextBucket[0]];
                                    nodeGen[ndx].right = &nodeGen[nextBucket[1]];

                                    if (ndx > pow(2, loops) - 1  && oddlot)
                                    {
                                        used[ndx] = ndx;
                                        if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                        {
                                            nodeGen[ndx].left = NULL;
                                        }
                                        if (nextBucket[1] <= ndx)
                                        {
                                            nodeGen[ndx].right = NULL;
                                        }
                                    }
                                }
                                else
                                {
                                    nodeGen[ndx].left  = NULL;
                                    nodeGen[ndx].right = NULL;
                                }
                                if (printFlag)
                                {
                                    printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[ndx], ndx, nodeGen[ndx].left, nlx, nodeGen[ndx].right, nrx);
                                }
                            }
                            //comments
                            for (int o = 0; o < ((buckets > 63) ? 2 : 0); o++)
                            {
                                if (cnt < buckets)
                                {
                                    odx   = pow(2, loops - 6) - 1  + okick;
                                    okick = okick + pow(2, loops - 5);
                                    if (odx > buckets - 1 - oob)
                                    {
                                        while (odx > buckets - 1 - oob)
                                        {
                                            odx = odx - 1;
                                        }
                                        oob++;
                                    }
                                    nodeGen[odx].index = odx;
                                    if (printFlag)
                                    {
                                        printf("index = %d  odx = %d word = %s cnt = %d \n", nodeGen[odx].index, odx, nodeGen[odx].word, cnt) ;
                                    }
                                    cnt ++;

                                    if (loops > 6)
                                    {
                                        nextBucket = nextBuckets(loops - 7, pkick, oob,  buckets);
                                        if (printFlag)
                                        {
                                            printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                                        }

                                        nodeGen[odx].left  = &nodeGen[nextBucket[0]];
                                        nodeGen[odx].right = &nodeGen[nextBucket[1]];

                                        if (odx > pow(2, loops) - 1  && oddlot)
                                        {
                                            used[odx] = odx;
                                            if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                            {
                                                nodeGen[odx].left = NULL;
                                            }
                                            if (nextBucket[1] <= odx)
                                            {
                                                nodeGen[odx].right = NULL;
                                            }
                                        }
                                    }
                                    else
                                    {
                                        nodeGen[odx].left  = NULL;
                                        nodeGen[odx].right = NULL;
                                    }
                                    if (printFlag)
                                    {
                                        printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[odx], odx, nodeGen[odx].left, olx, nodeGen[odx].right, orx);
                                    }
                                }
                                //comments
                                for (int p = 0; p < ((buckets > 127) ? 2 : 0); p++)
                                {
                                    if (cnt < buckets)
                                    {
                                        pdx   = pow(2, loops - 7) - 1  + pkick;
                                        pkick = pkick + pow(2, loops - 6);
                                        if (pdx > buckets - 1 - oob)
                                        {
                                            while (pdx > buckets - 1 - oob)
                                            {
                                                pdx = pdx - 1;
                                            }
                                            oob++;
                                        }
                                        nodeGen[pdx].index = pdx;
                                        if (printFlag)
                                        {
                                            printf("index = %d  pdx = %d word = %s cnt = %d \n", nodeGen[pdx].index, pdx, nodeGen[pdx].word, cnt) ;
                                        }
                                        cnt ++;

                                        if (loops > 7)
                                        {
                                            nextBucket = nextBuckets(loops - 8, qkick, oob,  buckets);
                                            if (printFlag)
                                            {
                                                printf("NEXT %d %d\n", nextBucket[0], nextBucket[1]);
                                            }

                                            nodeGen[pdx].left  = &nodeGen[nextBucket[0]];
                                            nodeGen[pdx].right = &nodeGen[nextBucket[1]];

                                            if (pdx > pow(2, loops) - 1  && oddlot)
                                            {
                                                used[pdx] = pdx;
                                                if (nextBucket[0] <= pow(2, loops) - 1 || used[nextBucket[0]] == nextBucket[0])
                                                {
                                                    nodeGen[pdx].left = NULL;
                                                }
                                                if (nextBucket[1] <= pdx)
                                                {
                                                    nodeGen[pdx].right = NULL;
                                                }
                                            }
                                        }
                                        else
                                        {
                                            nodeGen[pdx].left  = NULL;
                                            nodeGen[pdx].right = NULL;
                                        }
                                        if (printFlag)
                                        {
                                            printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[pdx], pdx, nodeGen[pdx].left, plx, nodeGen[pdx].right, prx);
                                        }
                                    }
                                    //comments
                                    for (int q = 0; q < ((buckets > 255) ? 2 : 0); q++)
                                    {
                                        if (cnt < buckets)
                                        {
                                            qdx   = pow(2, loops - 8) - 1  +  qkick;
                                            qkick = qkick + pow(2, loops - 7);
                                            if (qdx > buckets - 1 - oob)
                                            {
                                                while (qdx > buckets - 1 - oob)
                                                {
                                                    qdx = qdx - 1;
                                                }
                                                oob++;
                                            }
                                            nodeGen[qdx].index = qdx;
                                            if (printFlag)
                                            {
                                                printf("index = %d  qdx = %d word = %s cnt = %d \n", nodeGen[qdx].index, qdx, nodeGen[qdx].word, cnt) ;
                                            }
                                            cnt ++;

                                            nodeGen[qdx].left  = NULL;
                                            nodeGen[qdx].right = NULL;

                                            if (printFlag)
                                            {
                                                printf("node = %p %d  left: %p  %d  right: %p  %d\n\n", &nodeGen[qdx], qdx, nodeGen[qdx].left, qlx, nodeGen[qdx].right, qrx);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    return (&nodeGen[idx]);
}

int *nextBuckets(int power, int kick, int oob, int buckets)
{
    //comments
    static int  next[2];

    for (int i = 0; i < 2; i++)
    {
        next[i]   = pow(2, power) - 1  + kick;
        kick = kick + pow(2, power + 1);

        if (next[i] > buckets - 1 - oob)
        {
            while (next[i] > buckets - 1 - oob)
            {
                next[i] = next[i] -  1 ;
            }
            oob++;
        }
    }
    return next;
}
EN

回答 1

Code Review用户

回答已采纳

发布于 2020-10-02 22:34:53

不按字符读取文件字符

一次读取一个文件一个字符是很慢的。即使标准输入被缓冲,您仍然有单独调用fgetc()的开销,每次都必须检查输入缓冲区中是否至少有一个字符,如果是这样,则从缓冲区中删除它并返回它,否则在缓冲区中读取更多的数据。

处理该文件的最快方法可能是mmap()它,尽管这不是可移植的C。相反,可以考虑使用fgets()读取整行,或者每次使用fread()读取几千字节。然后循环遍历刚才读取的字符行/数组。

请注意,如果您以整行方式阅读,并且您知道没有连字符,那么您始终知道您已经读取了所有单词,因此您不再需要将字符复制到word[]中,您只需要跟踪单词的开始和结束。

在提交

中不包括基准代码

请注意,对getrusage()的调用本身可能会很昂贵,所以不要在提交的内容中包括这一点。实际上,我根本不会调用getrusage(),而是使用time或Linux的perf这样的外部工具来测量程序使用的时间或CPU周期。

如果可能的话,

避免分配内存,

在构建字典时,首先分配内存在文件中读取,然后为一个单词数组分配内存,然后为散列桶分配内存,然后为每个桶为存储在其中的单词分配一些内存。基本上,记忆中的每个单词都有三个副本。这是在构建trie之前,trie每个字都有大量的分配。

这是对内存的巨大浪费,复制所有数据也不是免费的。如果字典比输入文本大,那也是一种巨大的浪费。因此,尽可能地删除多余的副本和分配。

我还看到您在malloc()中调用hash(),只是为了一个小的临时缓冲区。在本例中,只需使用一个简单的数组:

代码语言:javascript
复制
char lcWord[LENGTH + 1];

加载字典

的更快方法

问题规范提到,字典列出了已按字典排序的所有单词。除非要拼写检查的文本比字典有更多的单词,否则我甚至不会麻烦地对它进行散列或创建一棵树,而只是使用mmap(),并使用二进制搜索从字典中的输入中查找单词。但是让我们假设我们只需要使用标准的C函数,并且您确实希望构建一个trie。

首先,像您已经在内存中所做的那样加载字典。请注意,现在您已经将字典中的所有单词都存储在内存中,因此您不再需要复制它们,而只需在缓冲区中创建指针即可。您可以用NUL字节替换所有换行符,这样所有的单词都是有效的C字符串。

在构建trie时,可以使用字典已排序的事实。

Hash?崔?两者都是?

我很难理解你是如何编制词典索引的。hash()函数看起来一点也不像散列函数,而是进行二进制搜索。散列函数的目的是避免任何类型的树遍历。在您的hash()返回之后,您将在check()中执行另一个二进制搜索。我想这会让你两败俱伤。

问题陈述清楚地指出,您应该实现“使用哈希表”的拼写检查器。因此,我将完全避免使用trie,只需专注于编写一个好的哈希表实现。如果您真的想使用trie,那么就不要使用hash(),只需将字典编码为一个trie即可。

票数 3
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/250120

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档