Parsing and Tokenization in Go
Parsing and tokenization are fundamental techniques for processing structured text. This guide covers building lexers, parsers, and handling various data formats in Go.
Tokenization Basics
Simple Tokenizer
package main
import (
"fmt"
"strings"
"unicode"
)
// TokenType classifies a lexical token.
type TokenType int

const (
	TokenEOF TokenType = iota
	TokenNumber
	TokenOperator
	TokenIdentifier
	TokenWhitespace
)

// Token is a single lexical unit: its class and the literal text it was
// scanned from.
type Token struct {
	Type  TokenType
	Value string
}

// tokenize splits input into number, identifier, and operator tokens.
// Whitespace separates tokens; any other character is silently ignored.
// A digit run immediately followed by a letter ends the number, so
// "42x" yields the tokens 42 and x.
func tokenize(input string) []Token {
	var tokens []Token
	var current strings.Builder

	// flush emits the pending literal (if any), classifying it as a
	// number when it starts with a digit and an identifier otherwise.
	// The original repeated this classification at three call sites.
	flush := func() {
		if current.Len() == 0 {
			return
		}
		lit := current.String()
		current.Reset()
		if unicode.IsDigit(rune(lit[0])) {
			tokens = append(tokens, Token{TokenNumber, lit})
		} else {
			tokens = append(tokens, Token{TokenIdentifier, lit})
		}
	}

	for _, ch := range input {
		switch {
		case unicode.IsDigit(ch):
			current.WriteRune(ch)
		case unicode.IsLetter(ch):
			// A letter after a digit run terminates the number.
			if current.Len() > 0 && unicode.IsDigit(rune(current.String()[0])) {
				flush()
			}
			current.WriteRune(ch)
		case strings.ContainsRune("+-*/", ch):
			flush()
			tokens = append(tokens, Token{TokenOperator, string(ch)})
		case unicode.IsSpace(ch):
			flush()
		}
	}
	flush() // emit a trailing literal, if any
	return tokens
}
// main tokenizes a small arithmetic expression and prints each token's
// type (as its numeric enum value) and literal text.
func main() {
	expr := "x + 42 * y"
	for _, tok := range tokenize(expr) {
		fmt.Printf("Type: %d, Value: %s\n", tok.Type, tok.Value)
	}
}
Building a Lexer
Lexer Implementation
package main
import (
"fmt"
"strings"
"unicode"
)
// Lexer performs cursor-based lexical scanning over an input string.
// pos indexes bytes and current holds the byte at pos widened to a rune
// (0 once the input is exhausted).
// NOTE(review): because indexing is byte-wise, multi-byte UTF-8
// characters would be split — this assumes ASCII input; confirm callers.
type Lexer struct {
	input   string
	pos     int
	current rune
}
// NewLexer returns a Lexer positioned at the first byte of input.
// For empty input, current stays 0, which NextToken reports as EOF.
func NewLexer(input string) *Lexer {
	lex := &Lexer{input: input}
	if input != "" {
		lex.current = rune(input[0])
	}
	return lex
}
// advance moves the cursor one byte forward, setting current to 0 when
// the end of the input is reached.
func (l *Lexer) advance() {
	l.pos++
	if l.pos < len(l.input) {
		l.current = rune(l.input[l.pos])
		return
	}
	l.current = 0
}
// peek returns the byte after the current one without consuming it,
// or 0 when fewer than two bytes remain.
func (l *Lexer) peek() rune {
	if next := l.pos + 1; next < len(l.input) {
		return rune(l.input[next])
	}
	return 0
}
// skipWhitespace consumes a run of consecutive whitespace characters,
// leaving the cursor on the first non-space character (or at EOF).
func (l *Lexer) skipWhitespace() {
	for {
		if !unicode.IsSpace(l.current) {
			return
		}
		l.advance()
	}
}
// readNumber consumes a maximal run of decimal digits and returns it.
// Uses strings.Builder instead of the original string += loop, which
// reallocated the accumulator on every digit.
func (l *Lexer) readNumber() string {
	var b strings.Builder
	for unicode.IsDigit(l.current) {
		b.WriteRune(l.current)
		l.advance()
	}
	return b.String()
}
// readIdentifier consumes a maximal run of letters, digits, and
// underscores and returns it. Uses strings.Builder instead of the
// original quadratic string += loop.
func (l *Lexer) readIdentifier() string {
	var b strings.Builder
	for unicode.IsLetter(l.current) || unicode.IsDigit(l.current) || l.current == '_' {
		b.WriteRune(l.current)
		l.advance()
	}
	return b.String()
}
// NextToken scans and returns the next token, or a TokenEOF token once
// the input is exhausted.
//
// Bug fix: the original returned TokenEOF as soon as it met a character
// that was not a digit, letter, or one of "+-*/", which silently dropped
// the remainder of the input. Unrecognised characters are now skipped
// and scanning continues.
func (l *Lexer) NextToken() Token {
	for {
		l.skipWhitespace()
		switch {
		case l.current == 0:
			return Token{TokenEOF, ""}
		case unicode.IsDigit(l.current):
			return Token{TokenNumber, l.readNumber()}
		case unicode.IsLetter(l.current):
			return Token{TokenIdentifier, l.readIdentifier()}
		case strings.ContainsRune("+-*/", l.current):
			tok := Token{TokenOperator, string(l.current)}
			l.advance()
			return tok
		default:
			// Unrecognised character: skip it and keep scanning.
			l.advance()
		}
	}
}
// main drives the lexer over a sample expression, printing tokens until
// EOF is reached.
func main() {
	lex := NewLexer("x + 42 * y")
	for tok := lex.NextToken(); tok.Type != TokenEOF; tok = lex.NextToken() {
		fmt.Printf("Type: %d, Value: %s\n", tok.Type, tok.Value)
	}
}
Building a Parser
Simple Expression Parser
package main
import (
"fmt"
"strconv"
)
// Parser is a recursive-descent evaluator for integer arithmetic.
// It pulls tokens from lexer one at a time, keeping a single token of
// lookahead in current.
type Parser struct {
	lexer   *Lexer
	current Token
}
// NewParser builds a Parser over input and primes the one-token
// lookahead with the first token from the lexer.
func NewParser(input string) *Parser {
	p := &Parser{lexer: NewLexer(input)}
	p.current = p.lexer.NextToken()
	return p
}
// advance consumes the current token and loads the next one from the
// lexer into the lookahead.
func (p *Parser) advance() {
	p.current = p.lexer.NextToken()
}
// parseNumber converts the current number token to an int and consumes it.
// NOTE(review): the strconv.Atoi error is deliberately discarded — the
// lexer only emits digit runs — but a run that overflows int would be
// silently misparsed; consider surfacing the error.
func (p *Parser) parseNumber() int {
	num, _ := strconv.Atoi(p.current.Value)
	p.advance()
	return num
}
// parseExpression parses and evaluates addition and subtraction,
// left-associatively: expression := term (("+" | "-") term)*.
func (p *Parser) parseExpression() int {
	acc := p.parseTerm()
	for p.current.Type == TokenOperator {
		op := p.current.Value
		if op != "+" && op != "-" {
			break
		}
		p.advance()
		switch rhs := p.parseTerm(); op {
		case "+":
			acc += rhs
		default:
			acc -= rhs
		}
	}
	return acc
}
// parseTerm parses and evaluates multiplication and division,
// left-associatively: term := factor (("*" | "/") factor)*.
// NOTE(review): integer division by zero panics at runtime; there is no
// guard for inputs like "1 / 0".
func (p *Parser) parseTerm() int {
	acc := p.parseFactor()
	for p.current.Type == TokenOperator {
		op := p.current.Value
		if op != "*" && op != "/" {
			break
		}
		p.advance()
		rhs := p.parseFactor()
		if op == "*" {
			acc *= rhs
		} else {
			acc /= rhs
		}
	}
	return acc
}
// parseFactor parses a single operand. Only number literals are
// supported; any other token yields 0 and is not consumed.
func (p *Parser) parseFactor() int {
	if p.current.Type != TokenNumber {
		return 0
	}
	return p.parseNumber()
}
// main evaluates a sample expression, demonstrating that * binds more
// tightly than +.
func main() {
	p := NewParser("2 + 3 * 4")
	fmt.Println("Result:", p.parseExpression()) // Output: 14
}
Practical Parsing Examples
CSV Parser
package main
import (
"fmt"
"strings"
)
// parseCSV splits one CSV record into its fields. Commas inside
// double-quoted sections do not split; the quotes themselves are
// dropped, and each field is whitespace-trimmed.
//
// Bug fix: per RFC 4180, a doubled quote ("") inside a quoted field is
// an escaped literal quote. The original treated it as two toggles and
// dropped it; it is now emitted as a single '"'.
func parseCSV(line string) []string {
	var fields []string
	var field strings.Builder
	inQuotes := false
	runes := []rune(line)
	for i := 0; i < len(runes); i++ {
		ch := runes[i]
		switch {
		case ch == '"':
			if inQuotes && i+1 < len(runes) && runes[i+1] == '"' {
				field.WriteRune('"') // escaped quote inside a quoted field
				i++
			} else {
				inQuotes = !inQuotes
			}
		case ch == ',' && !inQuotes:
			fields = append(fields, strings.TrimSpace(field.String()))
			field.Reset()
		default:
			field.WriteRune(ch)
		}
	}
	// The final field has no trailing comma; emit whatever remains.
	fields = append(fields, strings.TrimSpace(field.String()))
	return fields
}
// main parses one sample CSV record and prints each field with its index.
func main() {
	record := `"John Doe", 30, "[email protected]"`
	for i, f := range parseCSV(record) {
		fmt.Printf("Field %d: %s\n", i, f)
	}
}
JSON-like Parser
package main
import (
"fmt"
"strings"
"unicode"
)
// JSONValue is the dynamic result type of the parser; this simplified
// implementation only ever produces string values (or nil).
type JSONValue interface{}

// JSONParser is a cursor-based parser over a JSON-like input string.
// pos indexes bytes into input.
type JSONParser struct {
	input string
	pos   int
}
// NewJSONParser returns a parser positioned at the start of input.
// pos starts at its zero value, the first byte.
func NewJSONParser(input string) *JSONParser {
	return &JSONParser{input: input}
}
// skipWhitespace advances pos past any run of whitespace bytes, stopping
// at the first non-space byte or the end of input.
func (p *JSONParser) skipWhitespace() {
	for p.pos < len(p.input) {
		if !unicode.IsSpace(rune(p.input[p.pos])) {
			return
		}
		p.pos++
	}
}
// parseString reads a double-quoted string starting at the opening quote
// and returns its contents; pos is left just past the closing quote.
//
// Bug fix: the original built the result with string(p.input[p.pos]),
// which widens each byte to a code point and corrupts multi-byte UTF-8
// content; WriteByte preserves the raw bytes.
// NOTE(review): JSON escape sequences (\", \n, \uXXXX) are not handled,
// and an unterminated string consumes the rest of the input.
func (p *JSONParser) parseString() string {
	p.pos++ // skip the opening quote
	var b strings.Builder
	for p.pos < len(p.input) && p.input[p.pos] != '"' {
		b.WriteByte(p.input[p.pos])
		p.pos++
	}
	p.pos++ // skip the closing quote
	return b.String()
}
// parseValue parses the next JSON value. Only string literals are
// supported; anything else — including end of input — yields nil.
//
// Bug fix: the original indexed p.input[p.pos] without a bounds check
// and panicked on empty or all-whitespace input.
func (p *JSONParser) parseValue() JSONValue {
	p.skipWhitespace()
	if p.pos >= len(p.input) {
		return nil
	}
	if p.input[p.pos] == '"' {
		return p.parseString()
	}
	return nil
}
// main parses a quoted string literal and prints the decoded value.
func main() {
	p := NewJSONParser(`"Hello, World!"`)
	fmt.Println("Parsed:", p.parseValue())
}
Configuration File Parser
package main
import (
"fmt"
"strings"
)
// Config holds key/value settings parsed from a simple INI-like format.
type Config map[string]string

// parseConfig reads "key = value" lines from content. Blank lines,
// comment lines beginning with '#', and lines without '=' are skipped;
// keys and values are whitespace-trimmed. Only the first '=' splits, so
// values may themselves contain '='.
func parseConfig(content string) Config {
	cfg := make(Config)
	for _, raw := range strings.Split(content, "\n") {
		line := strings.TrimSpace(raw)
		if line == "" || strings.HasPrefix(line, "#") {
			continue // blank line or comment
		}
		key, value, ok := strings.Cut(line, "=")
		if !ok {
			continue // not a key=value line
		}
		cfg[strings.TrimSpace(key)] = strings.TrimSpace(value)
	}
	return cfg
}
// main parses a sample configuration document and prints every setting.
// Note: map iteration order is random, so output order varies run to run.
func main() {
	content := `
# Configuration file
host = localhost
port = 8080
debug = true
`
	for key, value := range parseConfig(content) {
		fmt.Printf("%s: %s\n", key, value)
	}
}
Best Practices
✅ Good Practices
// Use lexer for tokenization
lexer := NewLexer(input)
// Use parser for syntax analysis
parser := NewParser(input)
// Handle errors properly
if err != nil {
// Handle error
}
// Use recursive descent for simple grammars
// Use more advanced techniques for complex grammars
// Test with various inputs
// Include edge cases
❌ Anti-Patterns
// Don't parse without tokenizing
// Tokenization simplifies parsing
// Don't ignore errors
// Always handle parsing errors
// Don't use regex for complex parsing
// Use proper parsers instead
// Don't hardcode parsing logic
// Use structured approach
Resources
Summary
Parsing and tokenization are essential for text processing:
- Use lexers for tokenization
- Use parsers for syntax analysis
- Implement recursive descent for simple grammars
- Handle errors properly
- Test with various inputs
- Use appropriate techniques for complexity
With these techniques, you can build robust parsers for various data formats in Go.
Comments