首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >命令行字典工具,使用webscraping

命令行字典工具,使用webscraping
EN

Code Review用户
提问于 2020-04-06 19:13:33
回答 1查看 85关注 0票数 3

这是我学完 Go by Example 和 https://tour.golang.org 之后写的第一个像样的 Go 程序。我有 Python 背景。

这个程序从 Wordnik(wordnik.com)抓取单词释义,然后在命令行中美观地打印出来。它的用途是在命令行中快速查询单词。

我希望有人能够评审这段代码,并就效率低下的地方提出建议,尤其是指出代码中任何不符合 Go 惯用法、不能算作良好 Go 示例的部分。特别提一点:在代码末尾,我使用了一个由通道组成的切片(slice of channels)来跟踪多个 worker。我很想听听大家对这种做法的意见。

代码语言:go
复制
package main

import (
    "errors"
    "fmt"
    "github.com/PuerkitoBio/goquery"
    "gopkg.in/gookit/color.v1"
    "net/http"
    "os"
    "sort"
    "strings"
    "text/tabwriter"
)

// definition holds one dictionary entry for a word: its part of speech and
// the definition text. It carries no information about where the entry came
// from; see ctxDefinition for that.
type definition struct {
    wordType string // Part of speech: noun, verb, interjection, intransitive verb, etc
    text     string // The actual definition itself
}

// ctxDefinition is a definition together with its context: which dictionary
// it was taken from and its position relative to the other definitions.
type ctxDefinition struct {
    dict string // The dictionary the definition comes from
    rank uint8  // Position of this definition among the others; byDictionary sorts ascending on it
    def  definition
}

// byDictionary groups ctxDefinitions by dictionary and orders each group by
// ascending rank.
//
// It returns a map with dictionary names as keys and definition slices as
// values. The sort is stable, so definitions that share a rank keep the order
// they had in cDs. The input slice is not modified. An empty or nil input
// yields an empty, non-nil map.
func byDictionary(cDs []ctxDefinition) map[string][]definition {
    // Group the definitions per dictionary; used for ranking, not returned.
    grouped := make(map[string][]ctxDefinition)
    for _, cD := range cDs {
        grouped[cD.dict] = append(grouped[cD.dict], cD)
    }

    m := make(map[string][]definition, len(grouped))
    for dict, group := range grouped {
        // SliceStable (rather than Slice) keeps equal-rank entries in their
        // original order.
        sort.SliceStable(group, func(i, j int) bool {
            return group[i].rank < group[j].rank
        })
        // Keep the definitions only, dropping the context.
        defs := make([]definition, len(group))
        for i := range group {
            defs[i] = group[i].def
        }
        m[dict] = defs
    }
    return m
}

// render formats the definition as "<wordType>\t<text>".
// When c is true the word type is rendered in italics; the text is never
// styled. These are fixed, opinionated defaults. Use renderOps to choose the
// styles yourself.
func (d *definition) render(c bool) string {
    label := d.wordType
    if c {
        label = color.New(color.OpItalic).Render(label)
    }
    return label + "\t" + d.text
}

// renderOps formats the definition with caller-supplied styles: wordType is
// applied to the word type and text to the definition text. The two parts are
// separated by two tab characters.
func (d *definition) renderOps(wordType, text color.Style) string {
    styledType := wordType.Render(d.wordType)
    styledText := text.Render(d.text)
    return styledType + "\t\t" + styledText
}

// pprintCtxDefs pretty prints multiple context definitions to standard
// output, optionally with color.
//
// Definitions are grouped per dictionary (see byDictionary). Dictionaries are
// printed in alphabetical order so that the output is the same on every run.
// Each group is a heading line with the dictionary name, one line per
// definition, and a trailing blank line. When c is true the heading is
// printed on a gray background and the first definition of each group is
// emphasised (bold italic word type, cyan text).
func pprintCtxDefs(cDs []ctxDefinition, c bool) {
    m := byDictionary(cDs)

    // Map iteration order is random, so walk the names in sorted order.
    names := make([]string, 0, len(m))
    for name := range m {
        names = append(names, name)
    }
    sort.Strings(names)

    // NOTE(review): the color escape codes presumably count toward
    // tabwriter's cell widths; confirm that column alignment is acceptable.
    w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
    for _, name := range names {
        rest := m[name]
        if c {
            fmt.Fprintln(w, color.New(color.BgGray).Render(name))
            // Print first definition differently
            if len(rest) > 0 {
                fmt.Fprintln(w, rest[0].renderOps(color.New(color.OpItalic, color.OpBold), color.New(color.Cyan)))
                rest = rest[1:]
            }
        } else {
            // Fprintln, not Fprintf: the name is scraped text and must not be
            // interpreted as a format string.
            fmt.Fprintln(w, name)
        }
        for i := range rest {
            fmt.Fprintln(w, rest[i].render(c))
        }
        fmt.Fprintln(w)
    }
    if err := w.Flush(); err != nil {
        fmt.Fprintln(os.Stderr, "writing definitions:", err)
    }
}

// wordnikLookup returns a slice of ctxDefinitions for the provided word.
// Looks up words using wordnik.com
//
// w is appended to the wordnik URL as-is and the request is sent with client.
// The returned definitions are in page order; rank is the position of a
// definition inside its list, capped at 255.
//
// An error is returned when the request cannot be built or sent, when the
// response status is not 200, when the HTML cannot be parsed, or when a
// definition list has no matching dictionary heading. The function never
// panics on unexpected page content.
func wordnikLookup(w string, client *http.Client) ([]ctxDefinition, error) {
    req, err := http.NewRequest("GET", "https://www.wordnik.com/words/"+w, nil)
    if err != nil {
        return nil, fmt.Errorf("building wordnik request for %q: %w", w, err)
    }
    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
    resp, err := client.Do(req)
    if err != nil {
        return nil, fmt.Errorf("couldn't connect to wordnik: %w", err)
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("wordnik returned %q for %q, likely a non-word like '../test' was passed", resp.Status, w)
    }
    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("malformed HTML from wordnik: %w", err)
    }

    // capitalize upper-cases the first character of s. It works on whole
    // characters, so a multi-byte first character is not corrupted, and it
    // accepts the empty string.
    capitalize := func(s string) string {
        for i := range s {
            if i > 0 { // i is the byte offset of the second character
                return strings.ToUpper(s[:i]) + s[i:]
            }
        }
        return strings.ToUpper(s)
    }

    ret := make([]ctxDefinition, 0)
    s := doc.Find(".word-module.module-definitions#define .guts.active").First()
    dicts := s.Find("h3")
    lists := s.Find("ul")
    var layoutErr error
    // Go through each list of defs., then each def., and add them
    lists.Each(func(i int, list *goquery.Selection) {
        if layoutErr != nil {
            return
        }
        // The i-th list belongs to the i-th heading; bail out instead of
        // indexing past the headings or dereferencing a missing text node.
        if i >= dicts.Length() || dicts.Get(i).FirstChild == nil {
            layoutErr = errors.New("unexpected page layout from wordnik: definition list without a dictionary heading")
            return
        }
        // dictionary: strip the "from " prefix, capitalize, drop the ending period
        d := strings.TrimPrefix(dicts.Get(i).FirstChild.Data, "from ")
        d = strings.TrimSuffix(capitalize(d), ".")

        list.Find("li").Each(func(j int, def *goquery.Selection) {
            // wordType
            wT := strings.TrimSpace(def.Find("abbr").First().Text() + " " + def.Find("i").First().Text())
            // definition text - remove the wordType at the beginning of the definition
            t := def.Text()
            if len(wT) <= len(t) {
                t = t[len(wT):]
            }
            t = capitalize(strings.TrimSpace(t))
            // rank is a uint8; cap it rather than letting it wrap around.
            rank := j
            if rank > 255 {
                rank = 255
            }
            ret = append(ret, ctxDefinition{
                dict: d,
                rank: uint8(rank),
                def: definition{
                    wordType: wT,
                    text:     t,
                },
            })
        })
    })
    if layoutErr != nil {
        return nil, layoutErr
    }
    return ret, nil
}

// main looks up every word given on the command line and prints its
// definitions in the order the words were given.
//
// Lookups run concurrently, at most maxConcurrent at a time. A failed lookup
// is reported on standard error and does not stop the remaining words; the
// process then exits with status 1.
func main() {
    if len(os.Args) <= 1 {
        fmt.Println("Provide a word to lookup.")
        return
    }
    client := &http.Client{}
    words := os.Args[1:]

    // lookupResult carries the outcome of one lookup back to main.
    type lookupResult struct {
        defs []ctxDefinition
        err  error
    }

    const maxConcurrent = 4
    slots := make(chan struct{}, maxConcurrent) // limits the lookups in flight

    // One channel per word, so results can be printed in input order. Each
    // goroutine is handed its own channel: reading the results slice from
    // the goroutine while this loop is still filling it would be a data race.
    results := make([]chan lookupResult, len(words))
    for i, word := range words {
        ch := make(chan lookupResult, 1) // buffered: the sender never blocks
        results[i] = ch
        go func(w string) {
            slots <- struct{}{}
            defer func() { <-slots }()
            defs, err := wordnikLookup(w, client)
            ch <- lookupResult{defs: defs, err: err}
        }(word)
    }

    // Print the answer of each word
    failed := false
    for i, ch := range results {
        res := <-ch
        if res.err != nil {
            fmt.Fprintf(os.Stderr, "%s: %v\n", words[i], res.err)
            failed = true
            continue
        }
        color.New(color.BgRed, color.White).Println(words[i])
        pprintCtxDefs(res.defs, true)
    }
    if failed {
        os.Exit(1)
    }
}

此代码是在GPL版本3下授权的。它将被上传到Github。任何想要重用或修改此代码的人都必须遵守该许可证。

EN

回答 1

Code Review用户

发布于 2020-04-07 12:03:59

主函数的两个循环是有问题的。

用同一个索引访问两个切片,并假设它们长度相同,这种做法很脆弱。

第一个循环没有上限,这意味着如果我传入大量单词,它就会启动同样多的 goroutine、发起同样多的请求。这肯定会给一些用户带来麻烦。

另外,第二个循环也不够理想:它不是在最快的结果返回时就开始输出,而是始终等待切片中的第一个元素。这意味着,如果第一个请求由于任何原因变慢,其他可能更早完成的结果都要等到第一个请求完成之后才会显示。在并发编程中,这绝对不是想要的行为。

其余的代码还算可以,我没有深入研究。

下面是您的主函数的更新版本,它用更符合惯例的方式,通过通道把数据(输入的单词、输出的结果,包括可能的错误)传入和传出 goroutine,并使用了更常见的同步机制。为了演示,它还把并发请求的数量限制为 4 个。

代码语言:go
复制
package main

import (
    "errors"
    "fmt"
    "net/http"
    "os"
    "sort"
    "strings"
    "sync"
    "text/tabwriter"

    "github.com/PuerkitoBio/goquery"
    "github.com/gookit/color"
)

// definition holds one dictionary entry for a word: its part of speech and
// the definition text. It carries no information about where the entry came
// from; see ctxDefinition for that.
type definition struct {
    wordType string // Part of speech: noun, verb, interjection, intransitive verb, etc
    text     string // The actual definition itself
}

// ctxDefinition is a definition together with its context: which dictionary
// it was taken from and its position relative to the other definitions.
type ctxDefinition struct {
    dict string // The dictionary the definition comes from
    rank uint8  // Position of this definition among the others; byDictionary sorts ascending on it
    def  definition
}

// byDictionary groups ctxDefinitions by dictionary and orders each group by
// ascending rank.
//
// It returns a map with dictionary names as keys and definition slices as
// values. The sort is stable, so definitions that share a rank keep the order
// they had in cDs. The input slice is not modified. An empty or nil input
// yields an empty, non-nil map.
func byDictionary(cDs []ctxDefinition) map[string][]definition {
    // Group the definitions per dictionary; used for ranking, not returned.
    grouped := make(map[string][]ctxDefinition)
    for _, cD := range cDs {
        grouped[cD.dict] = append(grouped[cD.dict], cD)
    }

    m := make(map[string][]definition, len(grouped))
    for dict, group := range grouped {
        // SliceStable (rather than Slice) keeps equal-rank entries in their
        // original order.
        sort.SliceStable(group, func(i, j int) bool {
            return group[i].rank < group[j].rank
        })
        // Keep the definitions only, dropping the context.
        defs := make([]definition, len(group))
        for i := range group {
            defs[i] = group[i].def
        }
        m[dict] = defs
    }
    return m
}

// render formats the definition as "<wordType>\t<text>".
// When c is true the word type is rendered in italics; the text is never
// styled. These are fixed, opinionated defaults. Use renderOps to choose the
// styles yourself.
func (d *definition) render(c bool) string {
    label := d.wordType
    if c {
        label = color.New(color.OpItalic).Render(label)
    }
    return label + "\t" + d.text
}

// renderOps formats the definition with caller-supplied styles: wordType is
// applied to the word type and text to the definition text. The two parts are
// separated by two tab characters.
func (d *definition) renderOps(wordType, text color.Style) string {
    styledType := wordType.Render(d.wordType)
    styledText := text.Render(d.text)
    return styledType + "\t\t" + styledText
}

// pprintCtxDefs pretty prints multiple context definitions to standard
// output, optionally with color.
//
// Definitions are grouped per dictionary (see byDictionary). Dictionaries are
// printed in alphabetical order so that the output is the same on every run.
// Each group is a heading line with the dictionary name, one line per
// definition, and a trailing blank line. When c is true the heading is
// printed on a gray background and the first definition of each group is
// emphasised (bold italic word type, cyan text).
func pprintCtxDefs(cDs []ctxDefinition, c bool) {
    m := byDictionary(cDs)

    // Map iteration order is random, so walk the names in sorted order.
    names := make([]string, 0, len(m))
    for name := range m {
        names = append(names, name)
    }
    sort.Strings(names)

    // NOTE(review): the color escape codes presumably count toward
    // tabwriter's cell widths; confirm that column alignment is acceptable.
    w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
    for _, name := range names {
        rest := m[name]
        if c {
            fmt.Fprintln(w, color.New(color.BgGray).Render(name))
            // Print first definition differently
            if len(rest) > 0 {
                fmt.Fprintln(w, rest[0].renderOps(color.New(color.OpItalic, color.OpBold), color.New(color.Cyan)))
                rest = rest[1:]
            }
        } else {
            // Fprintln, not Fprintf: the name is scraped text and must not be
            // interpreted as a format string.
            fmt.Fprintln(w, name)
        }
        for i := range rest {
            fmt.Fprintln(w, rest[i].render(c))
        }
        fmt.Fprintln(w)
    }
    if err := w.Flush(); err != nil {
        fmt.Fprintln(os.Stderr, "writing definitions:", err)
    }
}

// wordnikLookup returns a slice of ctxDefinitions for the provided word.
// Looks up words using wordnik.com
//
// w is appended to the wordnik URL as-is and the request is sent with client.
// The returned definitions are in page order; rank is the position of a
// definition inside its list, capped at 255.
//
// An error is returned when the request cannot be built or sent, when the
// response status is not 200, when the HTML cannot be parsed, or when a
// definition list has no matching dictionary heading. The function never
// panics on unexpected page content.
func wordnikLookup(w string, client *http.Client) ([]ctxDefinition, error) {
    req, err := http.NewRequest("GET", "https://www.wordnik.com/words/"+w, nil)
    if err != nil {
        // Nothing has been sent yet, so this is not a connection failure.
        return nil, fmt.Errorf("building wordnik request for %q: %w", w, err)
    }
    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
    resp, err := client.Do(req)
    if err != nil {
        return nil, fmt.Errorf("couldn't connect to wordnik: %w", err)
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("wordnik returned %q for %q, likely a non-word like '../test' was passed", resp.Status, w)
    }
    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("malformed HTML from wordnik: %w", err)
    }

    // capitalize upper-cases the first character of s. It works on whole
    // characters, so a multi-byte first character is not corrupted, and it
    // accepts the empty string.
    capitalize := func(s string) string {
        for i := range s {
            if i > 0 { // i is the byte offset of the second character
                return strings.ToUpper(s[:i]) + s[i:]
            }
        }
        return strings.ToUpper(s)
    }

    ret := make([]ctxDefinition, 0)
    s := doc.Find(".word-module.module-definitions#define .guts.active").First()
    dicts := s.Find("h3")
    lists := s.Find("ul")
    var layoutErr error
    // Go through each list of defs., then each def., and add them
    lists.Each(func(i int, list *goquery.Selection) {
        if layoutErr != nil {
            return
        }
        // The i-th list belongs to the i-th heading; bail out instead of
        // indexing past the headings or dereferencing a missing text node.
        if i >= dicts.Length() || dicts.Get(i).FirstChild == nil {
            layoutErr = errors.New("unexpected page layout from wordnik: definition list without a dictionary heading")
            return
        }
        // dictionary: strip the "from " prefix, capitalize, drop the ending period
        d := strings.TrimPrefix(dicts.Get(i).FirstChild.Data, "from ")
        d = strings.TrimSuffix(capitalize(d), ".")

        list.Find("li").Each(func(j int, def *goquery.Selection) {
            // wordType
            wT := strings.TrimSpace(def.Find("abbr").First().Text() + " " + def.Find("i").First().Text())
            // definition text - remove the wordType at the beginning of the definition
            t := def.Text()
            if len(wT) <= len(t) {
                t = t[len(wT):]
            }
            t = capitalize(strings.TrimSpace(t))
            // rank is a uint8; cap it rather than letting it wrap around.
            rank := j
            if rank > 255 {
                rank = 255
            }
            ret = append(ret, ctxDefinition{
                dict: d,
                rank: uint8(rank),
                def: definition{
                    wordType: wT,
                    text:     t,
                },
            })
        })
    })
    if layoutErr != nil {
        return nil, layoutErr
    }
    return ret, nil
}

// scrapRes is the outcome of looking up one word, as sent on the output
// channel by scrapWordnik.
type scrapRes struct {
    word string          // The word that was looked up
    defs []ctxDefinition // Definitions returned by wordnikLookup for word
    err  error           // Error returned by wordnikLookup for word, if any
}

// scrapWordnik is a worker loop. For every word received on input it calls
// wordnikLookup with client and sends exactly one scrapRes on output, holding
// the word, its definitions and the lookup error (if any).
//
// It returns once input has been closed and drained. It never closes output,
// so several workers can share the same channels; the caller closes output
// after all workers have returned. The channel parameters are directional to
// document, and let the compiler enforce, that input is only read and output
// is only written.
func scrapWordnik(client *http.Client, input <-chan string, output chan<- scrapRes) {
    for w := range input {
        defs, err := wordnikLookup(w, client)
        output <- scrapRes{
            word: w,
            defs: defs,
            err:  err,
        }
    }
}

// main looks up every word given on the command line with a fixed pool of
// workers and prints each result as soon as it is available, so the output
// order may differ from the argument order.
//
// A failed lookup is reported on standard error and does not stop the
// remaining words; the process then exits with status 1.
func main() {
    if len(os.Args) <= 1 {
        fmt.Println("Provide a word to lookup.")
        return
    }

    words := os.Args[1:]

    client := http.DefaultClient // prefer default http client if you are not configuring it.

    // prepare async communication pipes
    input := make(chan string)
    output := make(chan scrapRes)

    // start async workers
    const workers = 4 // upper bound on concurrent requests
    var wg sync.WaitGroup
    for i := 0; i < workers; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            scrapWordnik(client, input, output)
        }()
    }
    // close output once every worker is done, which ends the read loop below
    go func() {
        wg.Wait()
        close(output)
    }()

    // Feed the input pipe from its own goroutine. Both channels are
    // unbuffered: if main fed every word before reading output, all workers
    // would block sending their first result while main blocked sending the
    // next word, deadlocking as soon as there are more words than workers.
    go func() {
        defer close(input)
        for _, word := range words {
            input <- word
        }
    }()

    // read output to get results
    failed := false
    for r := range output {
        if r.err != nil {
            fmt.Fprintf(os.Stderr, "%s: %v\n", r.word, r.err)
            failed = true
            continue
        }
        color.New(color.BgRed, color.White).Println(r.word)
        pprintCtxDefs(r.defs, true)
    }
    if failed {
        os.Exit(1)
    }
}
```
代码语言:javascript
复制
票数 2
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/240071

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档