How to get both the chardata and the value of the attributes of an XML tag when decoding it in Golang

Issue

My XML file looks something like this:

<page>
    <title>Antoine Meillet</title>
    <ns>0</ns>
    <id>3</id>
    <revision>
      <id>178204512</id>
      <parentid>178097574</parentid>
      <timestamp>2020-12-30T10:12:14Z</timestamp>
      <contributor>
        <username>Rovo</username>
        <id>34820</id>
      </contributor>
      <minor />
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text bytes="11274" xml:space="preserve">
        a lot of text
      </text>
      <sha1>ikqy1f9ppwo8eo38a0hh817eynr40vg</sha1>
    </revision>
  </page>

My goal is to filter out a lot of those tags and only keep the page tag and those inner tags: title, id, text.

So far, I have been able to successfully extract the page tag with title and id having the right value.
This is what I get:

<page>
 <title>Antoine Meillet</title>
 <id>3</id>
 <text bytes="0" xml:space=""></text>
</page>
<page>
 <title>Algèbre linéaire</title>
 <id>7</id>
 <text bytes="0" xml:space=""></text>
</page>

As you can see, the problem is that the text tag does not have the right values for its attributes, and the text it should contain is missing.

I have achieved this using this piece of code:

package main

import (
    "encoding/xml"
    "fmt"
    "io"
    "os"
)

// pageText declares the two attributes read from (and written to) a <text>
// element: bytes and xml:space.
type pageText struct {
    Key   float32 `xml:"bytes,attr"`
    Space string  `xml:"xml:space,attr"`
}

// Page is the subset of a <page> element that is kept: its <title>, its <id>
// and a <text> element looked up as a direct child of <page>.
type Page struct {
    XMLName xml.Name `xml:"page"`
    Title   string   `xml:"title"`
    Id      int64    `xml:"id"`
    Text    pageText `xml:"text"`
}

// main filters frwiki10000.xml into cleaned_fr_wiki.xml and exits with a
// non-zero status if any step fails.
func main() {
    if err := run(); err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
}

// run streams frwiki10000.xml token by token, decodes every <page> element
// into a Page and re-encodes it, indented, into cleaned_fr_wiki.xml.
//
// It returns the first error encountered: opening or creating the files,
// reading a token, decoding or encoding a page, or closing the output file.
func run() (err error) {
    frwikiXML, err := os.Open("frwiki10000.xml")
    if err != nil {
        return fmt.Errorf("opening input: %w", err)
    }
    // Deferred right after a successful open so the file is released on
    // every return path.
    defer frwikiXML.Close()

    cleanedWikiXML, err := os.Create("cleaned_fr_wiki.xml")
    if err != nil {
        return fmt.Errorf("creating output: %w", err)
    }
    defer func() {
        // A failed close on a written file can mean lost data, so report it
        // unless an earlier error is already being returned.
        if closeErr := cleanedWikiXML.Close(); closeErr != nil && err == nil {
            err = fmt.Errorf("closing output: %w", closeErr)
        }
    }()

    cleanXMLEncoder := xml.NewEncoder(cleanedWikiXML)
    cleanXMLEncoder.Indent("", " ")

    frwikiDecoder := xml.NewDecoder(frwikiXML)
    for {
        t, tokenErr := frwikiDecoder.Token()
        if tokenErr == io.EOF {
            return nil
        }
        if tokenErr != nil {
            // Stop here: carrying on after a read error would keep asking a
            // broken decoder for tokens.
            return fmt.Errorf("decoding token: %w", tokenErr)
        }

        start, ok := t.(xml.StartElement)
        if !ok || start.Name.Local != "page" {
            continue
        }

        var page Page
        if err := frwikiDecoder.DecodeElement(&page, &start); err != nil {
            return fmt.Errorf("decoding element %q: %w", start.Name.Local, err)
        }
        fmt.Println("Element was decoded successfully.")
        fmt.Printf("Page title: %v\n Page id: %d\n", page.Title, page.Id)
        fmt.Printf("Text: %v", page.Text)

        if err := cleanXMLEncoder.Encode(page); err != nil {
            return fmt.Errorf("encoding page %d: %w", page.Id, err)
        }
    }
}

How would I be able to solve this problem, please?

Thanks.

Solution

To parse a huge XML file, use the standard xml.Decoder.

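Call Token to read the tokens one by one. When a start element with the required name ("page") is found, call DecodeElement to decode the whole element into a struct that you can then process further. Note that <text> is nested inside <revision>, so the struct has to mirror that nesting; a text field declared directly on the page struct never matches and keeps its zero values.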

// PageText holds a <text> element: its bytes and xml:space attributes and the
// character data it encloses. Page and PageTarget share this type so that a
// decoded value can be assigned directly from one to the other.
type PageText struct {
    // Key is the value of the bytes attribute. The float32 type is kept for
    // compatibility with existing code.
    Key float32 `xml:"bytes,attr"`
    // Space is the value of the xml:space attribute. The decoder resolves the
    // reserved "xml" prefix to its namespace URL and matches on the local
    // name, so the tag must be "<namespace URL> space"; a tag spelled
    // "xml:space" never matches and leaves the field empty. The encoder
    // writes this namespace back with the "xml" prefix.
    Space string `xml:"http://www.w3.org/XML/1998/namespace space,attr"`
    // Content is the character data between <text> and </text>.
    Content string `xml:",chardata"`
}

// Page mirrors the layout of a <page> element in the input document, where
// <text> is nested inside <revision>. Elements without a matching field are
// skipped when decoding.
type Page struct {
    XMLName  xml.Name `xml:"page"`
    Title    string   `xml:"title"`
    Id       int64    `xml:"id"`
    Revision struct {
        Text PageText `xml:"text"`
    } `xml:"revision"`
}

// PageTarget is the flattened output layout: <page> with <title>, <id> and
// <text> as direct children.
type PageTarget struct {
    XMLName xml.Name `xml:"page"`
    Title   string   `xml:"title"`
    Id      int64    `xml:"id"`
    Text    PageText `xml:"text"`
}
    // Read the document one token at a time; only <page> elements are decoded
    // in full, then flattened into a PageTarget and printed as indented XML.
    dec := xml.NewDecoder(strings.NewReader(sample))

    for {
        token, err := dec.Token()
        if err == io.EOF {
            // End of input: nothing left to decode.
            break
        }
        if err != nil {
            panic(err)
        }
        if token == nil {
            fmt.Println("token is nill")
        }

        // Skip everything that is not the opening tag of a <page> element.
        start, isStart := token.(xml.StartElement)
        if !isStart || start.Name.Local != "page" {
            continue
        }

        var decoded Page
        if decodeErr := dec.DecodeElement(&decoded, &start); decodeErr != nil {
            panic(decodeErr)
        }

        // Lift the text out of <revision> so it becomes a direct child of <page>.
        flattened := PageTarget{
            XMLName: decoded.XMLName,
            Title:   decoded.Title,
            Id:      decoded.Id,
            Text:    decoded.Revision.Text,
        }

        rendered, marshalErr := xml.MarshalIndent(flattened, " ", "  ")
        if marshalErr != nil {
            panic(marshalErr)
        }
        fmt.Println(string(rendered))
    }

PLAYGROUND

Answered By – kozmo

Answer Checked By – Willingham (GoLangFix Volunteer)

Leave a Reply

Your email address will not be published.