首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >分析按年份和类型分组的出版物的dblp XML和输出和

分析按年份和类型分组的出版物的dblp XML和输出和
EN

Code Review用户
提问于 2019-03-11 15:20:41
回答 2查看 422关注 0票数 6

以下Go程序解析一个gzipped XML文件(可用的这里),该文件包含有关计算机科学出版物的书目信息,并具有以下指示性结构:

代码语言:javascript
复制
<?xml version="1.0" encoding="ISO-8859-1"?>
<!DOCTYPE dblp SYSTEM "dblp.dtd">
<dblp>
    <article mdate="2017-05-28" key="journals/acta/Saxena96">
        <author>Sanjeev Saxena</author>
        <title>Parallel Integer Sorting and Simulation Amongst CRCW Models.</title>
        <pages>607-619</pages>
        <year>1996</year>
        <volume>33</volume>
        <journal>Acta Inf.</journal>
        <number>7</number>
        <url>db/journals/acta/acta33.html#Saxena96</url>
        <ee>https://doi.org/10.1007/BF03036466</ee>
    </article>
    <article mdate="2017-05-28" key="journals/acta/Simon83">
        <author>Hans Ulrich Simon</author>
        <title>Pattern Matching in Trees and Nets.</title>
        <pages>227-248</pages>
        <year>1983</year>
        <volume>20</volume>
        <journal>Acta Inf.</journal>
        <url>db/journals/acta/acta20.html#Simon83</url>
        <ee>https://doi.org/10.1007/BF01257084</ee>
    </article>
        <article mdate="2017-05-28" key="journals/acta/GoodmanS83">
        <author>Nathan Goodman</author>
        <author>Oded Shmueli</author>
        <title>NP-complete Problems Simplified on Tree Schemas.</title>
        <pages>171-178</pages>
        <year>1983</year>
        <volume>20</volume>
        <journal>Acta Inf.</journal>
        <url>db/journals/acta/acta20.html#GoodmanS83</url>
        <ee>https://doi.org/10.1007/BF00289414</ee>
    </article>
</dblp>

XML有多个发布类型,由元素的标题(即记录、书籍、phd命题)表示,我在程序中为每个发布类型定义了一个单独的结构:

代码语言:javascript
复制
package main

import (
    "compress/gzip"
    "encoding/csv"
    "encoding/xml"
    "fmt"
    "io"
    "log"
    "os"
    "sort"
    "strconv"
    "time"

    "golang.org/x/text/encoding/charmap"
)

// Dblp contains the array of articles in the dblp xml file
type Dblp struct {
    XMLName xml.Name `xml:"dblp"`
    Dblp    []Article
}

// Metadata contains the fields shared by all structs
type Metadata struct {
    Key    string `xml:"key,attr"` // not currently in use
    Year   string `xml:"year"`
    Author string `xml:"author"` // not currently in use
    Title  string `xml:"title"`  // not currently in use
}

// Article struct and the following structs contain the elements we want to parse and they "inherit" the metadata struct defined above
type Article struct {
    XMLName xml.Name `xml:"article"`
    Metadata
}

type InProceedings struct {
    XMLName xml.Name `xml:"inproceedings"`
    Metadata
}

type Proceedings struct {
    XMLName xml.Name `xml:"proceedings"`
    Metadata
}

type Book struct {
    XMLName xml.Name `xml:"book"`
    Metadata
}

type InCollection struct {
    XMLName xml.Name `xml:"incollection"`
    Metadata
}

type PhdThesis struct {
    XMLName xml.Name `xml:"phdthesis"`
    Metadata
}

type MastersThesis struct {
    XMLName xml.Name `xml:"mastersthesis"`
    Metadata
}

type WWW struct {
    XMLName xml.Name `xml:"www"`
    Metadata
}

// Record is used to store each Article's type and year which will be passed as a value to map m
type Record struct {
    UID  int
    ID   int
    Type string
    Year string
}

// SumRecord is used to store the aggregated articles by year in srMap map
//(count is stored in the map's int which is used as key)
type SumRecord struct {
    Type string
    Year string
}

该程序将每个发布存储在一个映射结构中,并最终导出两个csv文件:

  • results.csv,它包含每个出版物的id、发布类型和年份。
  • sumresults.csv,它包含每年每种出版物类型的总和。

这是我用Go编写的第一个“完整”程序--我目前正在尝试掌握这种语言,在编写这里这里时,我需要问两个关于堆栈溢出的问题。

其余代码如下:

代码语言:javascript
复制
func main() {
    // Start counting time
    start := time.Now()

    // Initialize counter variables for each publication type
    var articleCounter, InProceedingsCounter, ProceedingsCounter, BookCounter,
        InCollectionCounter, PhdThesisCounter, mastersThesisCounter, wwwCounter int
    var i = 1

    // Initialize hash map
    m := make(map[int]Record)

    //Open gzipped dblp xml
    xmlFile, err := os.Open("dblp.xml.gz")
    gz, err := gzip.NewReader(xmlFile)
    if err != nil {
        log.Fatal(err)

    }
    defer gz.Close()

    //Directly open xml file for testing purposes if needed - be sure to comment out gzip file opening above
    //xmlFile, err := os.Open("dblp.xml")
    //xmlFile, err := os.Open("TestDblp.xml")
    if err != nil {
        fmt.Println(err)
    } else {
        log.Println("Successfully Opened Dblp XML file")
    }

    // defer the closing of XML file so that we can parse it later on
    defer xmlFile.Close()

    // Initialize main object from Dblp struct
    var articles Dblp

    // Create decoder element
    decoder := xml.NewDecoder(gz)

    // Suppress xml errors
    decoder.Strict = false
    decoder.CharsetReader = makeCharsetReader
    err = decoder.Decode(&articles.Dblp)
    if err != nil {
        fmt.Println(err)
    }

    for {
        // Read tokens from the XML document in a stream.
        t, err := decoder.Token()

        // If we reach the end of the file, we are done
        if err == io.EOF {
            log.Println("XML successfully parsed:", err)
            break
        } else if err != nil {
            log.Fatalf("Error decoding token: %t", err)
        } else if t == nil {
            break
        }

        // Here, we inspect the token
        switch se := t.(type) {

        // We have the start of an element and the token we created above in t:
        case xml.StartElement:
            switch se.Name.Local {
            case "dblp":

            case "article":
                var p Article
                decoder.DecodeElement(&p, &se)
                increment(&articleCounter)
                m[i] = Record{i, articleCounter, "article", p.Year}
                increment(&i)

            case "inproceedings":
                var p InProceedings
                decoder.DecodeElement(&p, &se)
                increment(&InProceedingsCounter)
                m[i] = Record{i, InProceedingsCounter, "inproceedings", p.Year}
                increment(&i)

            case "proceedings":
                var p Proceedings
                decoder.DecodeElement(&p, &se)
                increment(&ProceedingsCounter)
                m[i] = Record{i, ProceedingsCounter, "proceedings", p.Year}
                increment(&i)

            case "book":
                var p Book
                decoder.DecodeElement(&p, &se)
                increment(&BookCounter)
                m[i] = Record{i, BookCounter, "proceedings", p.Year}
                increment(&i)

            case "incollection":
                var p InCollection
                decoder.DecodeElement(&p, &se)
                increment(&InCollectionCounter)
                m[i] = Record{i, InCollectionCounter, "incollection", p.Year}
                increment(&i)

            case "phdthesis":
                var p PhdThesis
                decoder.DecodeElement(&p, &se)
                increment(&PhdThesisCounter)
                m[i] = Record{i, PhdThesisCounter, "phdthesis", p.Year}
                increment(&i)

            case "mastersthesis":
                var p MastersThesis
                decoder.DecodeElement(&p, &se)
                increment(&mastersThesisCounter)
                m[i] = Record{i, mastersThesisCounter, "mastersthesis", p.Year}
                increment(&i)

            case "www":
                var p WWW
                decoder.DecodeElement(&p, &se)
                increment(&wwwCounter)
                m[i] = Record{i, wwwCounter, "www", p.Year}
                increment(&i)
            }
        }
    }
    log.Println("Element parsing completed in:", time.Since(start))

    // All parsed elements have been added to m := make(map[int]Record)
    // We can start processing the map.
    // First we create a map and count the number of occurences of each publication type for a given year.

    srMap := make(map[SumRecord]int)
    log.Println("Creating sums by article type per year")
    for key := range m {
        sr := SumRecord{
            Type: m[key].Type,
            Year: m[key].Year,
        }
        srMap[sr]++
    }

    //// Create sum csv
    log.Println("Creating sum results csv file")
    sumfile, err := os.Create("sumresult.csv")
    checkError("Cannot create file", err)
    defer sumfile.Close()
    sumwriter := csv.NewWriter(sumfile)
    defer sumwriter.Flush()

    // define column headers
    sumheaders := []string{
        "type",
        "year",
        "sum",
    }

    sumwriter.Write(sumheaders)
    var SumString string

    // Create sorted map by VALUE (integer)

    SortedSrMap := map[int]SumRecord{}
    SortedSrMapKeys := []int{}
    for key, val := range SortedSrMap {
        // SortedSrMap[val] = key
        // SortedSrMapKeys = append(SortedSrMapKeys, val)
        SumString = strconv.Itoa(key)
        fmt.Println("sumstring:", SumString, "value: ", val)
    }
    sort.Ints(SortedSrMapKeys)

    // END Create sorted map by VALUE (integer)

    // Export sum csv
    for key, val := range srMap {
        r := make([]string, 0, 1+len(sumheaders))
        SumString = strconv.Itoa(val)
        r = append(
            r,
            key.Type,
            key.Year,
            SumString,
        )
        sumwriter.Write(r)
    }
    sumwriter.Flush()

    // CREATE RESULTS CSV
    log.Println("Creating results csv file")
    file, err := os.Create("result.csv")
    checkError("Cannot create file", err)
    defer file.Close()
    writer := csv.NewWriter(file)
    defer writer.Flush()

    // define column headers
    headers := []string{
        "uid",
        "id",
        "type",
        "year",
    }

    // write column headers
    writer.Write(headers)

    var idString string
    var uidString string

    // Create sorted map
    var keys []int
    for k := range m {
        keys = append(keys, k)
    }
    sort.Ints(keys)

    for _, k := range keys {

        r := make([]string, 0, 1+len(headers)) // capacity of 4, 1 + the number of properties our struct has & the number of column headers we are passing

        // convert the Record.ID and UID ints to string in order to pass into append()
        idString = strconv.Itoa(m[k].ID)
        uidString = strconv.Itoa(m[k].UID)

        r = append(
            r,
            uidString,
            idString,
            m[k].Type,
            m[k].Year,
        )
        writer.Write(r)
    }
    writer.Flush()

    // END CREATE RESULTS CSV

    // Finally report results - update below line with more counters as desired
    log.Println("Articles:", articleCounter, "inproceedings", InProceedingsCounter, "proceedings:", ProceedingsCounter, "book:", BookCounter, "incollection:", InCollectionCounter, "phdthesis:", PhdThesisCounter, "mastersthesis:", mastersThesisCounter, "www:", wwwCounter)
    //log.Println("map:", m)
    //log.Println("map length:", len(m))
    //log.Println("sum map length:", len(srMap))
    //fmt.Println("sum map contents:", srMap)
    log.Println("XML parsing and csv export executed in:", time.Since(start))
}

func increment(i *int) {
    *i = *i + 1
}

func checkError(message string, err error) {
    if err != nil {
        log.Fatal(message, err)
    }
}

func makeCharsetReader(charset string, input io.Reader) (io.Reader, error) {
    if charset == "ISO-8859-1" {
        // Windows-1252 is a superset of ISO-8859-1, so it should be ok for this case
        return charmap.Windows1252.NewDecoder().Reader(input), nil
    }
    return nil, fmt.Errorf("Unknown charset: %s", charset)
}

我发现的主要问题和问题:

  • 考虑到文件的大小(474 Mb gzip),解析非常慢(大约需要3:45分钟)。我能改进一些东西使它更快吗?
  • 能不能减少代码的冗长性,而不是牺牲它的可读性/可理解性,让刚开始使用Go的人更容易理解?例如,通过泛化用于定义不同发布类型以及case / switch语句的结构?
EN

回答 2

Code Review用户

回答已采纳

发布于 2019-03-11 23:37:23

decoder.Decode调用是不必要的,实际上此时会抛出一个错误。

对于第二点,是的,特别是case语句都可以压缩成一个函数,因为它们都只有几个变量可交换。

索引到散列映射map[int]Record并不理想,事实上,这可能也会导致该表中的200万个元素的增长放缓,相反,您可以简单地将元素append到一个片段中,然后对元素进行排序,并在以后进行迭代,根本不需要排序。

而对于increment(&i) ..。继续增加计数器。如果您创建函数,好的,但这样无助于可读性(i += 1要清楚得多)。

make([]string, 0, 1+len(headers) --这是有效的,但是您可以使用所有元素创建数组,比如[]string{uidString, ..., m[k].Year等等。如果您可以在所有循环迭代中重用该数组,可能会更好。

我看不出还有什么明显的事情需要改变。有可能摆脱DecodeElement并自己解码可能会改进一些事情,但我对此表示怀疑。例如,如果我删除整个switch块,只做实质上的XML解码,这对我来说仍然需要三分钟,基本上比包含该块的时间少一分钟!也就是说,有了这个库,它的速度就不会更快了。

票数 2
EN

Code Review用户

发布于 2019-03-24 01:35:20

我已经重新检查了代码,以清理它,并在我对语言的理解方面取得进展时遵循一些建议。

要点:

现在只使用两个结构:

代码语言:javascript
复制
type Metadata struct {
    Key    string `xml:"key,attr"`
    Year   string `xml:"year"`
    Author string `xml:"author"`
    Title  string `xml:"title"`
}

type Record struct {
    UID  int
    ID   int
    Type string
    Year string
}

所有出版物都有下列功能:

代码语言:javascript
复制
func ProcessPublication(i Counter, publicationCounter Counter, publicationType string, publicationYear string, m map[int]Record) {
    m[i.Incr()] = Record{i.ReturnInt(), int(publicationCounter.Incr()), publicationType, publicationYear}
}

整个代码现在看起来如下所示:

代码语言:javascript
复制
package main

import (
    "compress/gzip"
    "encoding/csv"
    "encoding/xml"
    "fmt"
    "io"
    "log"
    "os"
    "sort"
    "strconv"
    "time"

    "golang.org/x/text/encoding/charmap"
)

// Metadata contains the fields shared by all structs
type Metadata struct {
    Key    string `xml:"key,attr"` // currently not in use
    Year   string `xml:"year"`
    Author string `xml:"author"` // currently not in use
    Title  string `xml:"title"`  // currently not in use
}

// Record is used to store each Article's type and year which will be passed as a value to map m
type Record struct {
    UID  int
    ID   int
    Type string
    Year string
}

type Count int

type Counter interface {
    Incr() int
    ReturnInt() int
}

var articleCounter, InProceedingsCounter, ProceedingsCounter, BookCounter,
    InCollectionCounter, PhdThesisCounter, mastersThesisCounter, wwwCounter, i Count

func main() {
    start := time.Now()

    //Open gzipped dblp xml
    //xmlFile, err := os.Open("TestDblp.xml.gz")
    // Uncomment below for actual xml
    xmlFile, err := os.Open("dblp.xml.gz")
    gz, err := gzip.NewReader(xmlFile)
    if err != nil {
        log.Fatal(err)

    } else {
        log.Println("Successfully Opened Dblp XML file")
    }

    defer gz.Close()

    // Create decoder element
    decoder := xml.NewDecoder(gz)

    // Suppress xml errors
    decoder.Strict = false
    decoder.CharsetReader = makeCharsetReader
    if err != nil {
        log.Fatal(err)
    }

    m := make(map[int]Record)
    var p Metadata

    for {
        // Read tokens from the XML document in a stream.
        t, err := decoder.Token()

        // If we reach the end of the file, we are done with parsing.
        if err == io.EOF {
            log.Println("XML successfully parsed:", err)
            break
        } else if err != nil {
            log.Fatalf("Error decoding token: %t", err)
        } else if t == nil {
            break
        }

        // Let's inspect the token
        switch se := t.(type) {

        // We have the start of an element and the token we created above in t:
        case xml.StartElement:
            switch se.Name.Local {

            case "article":
                decoder.DecodeElement(&p, &se)
                ProcessPublication(&i, &articleCounter, se.Name.Local, p.Year, m)

            case "inproceedings":
                decoder.DecodeElement(&p, &se)
                ProcessPublication(&i, &InProceedingsCounter, se.Name.Local, p.Year, m)

            case "proceedings":
                decoder.DecodeElement(&p, &se)
                ProcessPublication(&i, &ProceedingsCounter, se.Name.Local, p.Year, m)

            case "book":
                decoder.DecodeElement(&p, &se)
                ProcessPublication(&i, &BookCounter, se.Name.Local, p.Year, m)

            case "incollection":
                decoder.DecodeElement(&p, &se)
                ProcessPublication(&i, &InCollectionCounter, se.Name.Local, p.Year, m)

            case "phdthesis":
                decoder.DecodeElement(&p, &se)
                ProcessPublication(&i, &PhdThesisCounter, se.Name.Local, p.Year, m)

            case "mastersthesis":
                decoder.DecodeElement(&p, &se)
                ProcessPublication(&i, &mastersThesisCounter, se.Name.Local, p.Year, m)

            case "www":
                decoder.DecodeElement(&p, &se)
                ProcessPublication(&i, &wwwCounter, se.Name.Local, p.Year, m)
            }
        }
    }
    log.Println("XML parsing done in:", time.Since(start))

    // All parsed elements have been added to m := make(map[int]Record)
    // We create srMap map object and count the number of occurences of each publication type for a given year.

    srMap := make(map[Record]int)
    log.Println("Creating sums by article type per year")
    for key := range m {
        sr := Record{
            Type: m[key].Type,
            Year: m[key].Year,
        }
        srMap[sr]++
    }

    // Create sumresult.csv
    log.Println("Creating sum results csv file")
    sumfile, err := os.Create("sumresult.csv")
    checkError("Cannot create file", err)
    defer sumfile.Close()

    sumwriter := csv.NewWriter(sumfile)
    defer sumwriter.Flush()

    sumheaders := []string{
        "publicationType",
        "year",
        "sum",
    }

    sumwriter.Write(sumheaders)

    // Export sumresult.csv
    for key, val := range srMap {
        r := make([]string, 0, 1+len(sumheaders))
        r = append(
            r,
            key.Type,
            key.Year,
            strconv.Itoa(val),
        )
        sumwriter.Write(r)
    }
    sumwriter.Flush()

    // Create result.csv
    log.Println("Creating result.csv")

    file, err := os.Create("result.csv")
    checkError("Cannot create file", err)
    defer file.Close()

    writer := csv.NewWriter(file)
    defer writer.Flush()

    headers := []string{
        "uid",
        "id",
        "type",
        "year",
    }

    writer.Write(headers)

    // Create sorted map
    var keys []int
    for k := range m {
        keys = append(keys, k)
    }
    sort.Ints(keys)

    for _, k := range keys {

        r := make([]string, 0, 1+len(headers))
        r = append(
            r,
            strconv.Itoa(m[k].UID),
            strconv.Itoa(m[k].ID),
            m[k].Type,
            m[k].Year,
        )
        writer.Write(r)
    }
    writer.Flush()

    // Finally report results
    log.Println("Articles:", articleCounter, "inproceedings", InProceedingsCounter, "proceedings:",
        ProceedingsCounter, "book:", BookCounter, "incollection:", InCollectionCounter, "phdthesis:",
        PhdThesisCounter, "mastersthesis:", mastersThesisCounter, "www:", wwwCounter)
    log.Println("Distinct publication map length:", len(m))
    log.Println("Sum map length:", len(srMap))
    log.Println("XML parsing and csv export executed in:", time.Since(start))
}

func checkError(message string, err error) {
    if err != nil {
        log.Fatal(message, err)
    }
}

func makeCharsetReader(charset string, input io.Reader) (io.Reader, error) {
    if charset == "ISO-8859-1" {
        // Windows-1252 is a superset of ISO-8859-1, so it should be ok for correctly decoding the dblp.xml
        return charmap.Windows1252.NewDecoder().Reader(input), nil
    }
    return nil, fmt.Errorf("Unknown charset: %s", charset)
}

func (c *Count) Incr() int {
    *c = *c + 1
    return int(*c)
}

func (c *Count) ReturnInt() int {
    return int(*c)
}

func ProcessPublication(i Counter, publicationCounter Counter, publicationType string, publicationYear string, m map[int]Record) {
    m[i.Incr()] = Record{i.ReturnInt(), int(publicationCounter.Incr()), publicationType, publicationYear}
}

我觉得csv一代的部件可以进一步精简,因为它们仍然有点凌乱。

票数 0
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/215203

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档