Parsing and Tokenization in Go
Parsing and tokenization are fundamental techniques for processing structured text. This guide covers building lexers, parsers, and handling various data formats in Go.
Tokenization Basics
Simple Tokenizer
package main
import (
"fmt"
"strings"
"unicode"
)
// TokenType classifies a lexical token.
type TokenType int

const (
	TokenEOF TokenType = iota
	TokenNumber
	TokenOperator
	TokenIdentifier
	TokenWhitespace
)

// Token is a single lexical unit: its class and the literal text it was
// scanned from.
type Token struct {
	Type  TokenType
	Value string
}

// tokenize splits input into number, identifier, and operator tokens.
// Whitespace separates tokens; any other character is silently ignored.
// A digit run immediately followed by a letter ends the number, so
// "42x" yields the tokens 42 and x.
func tokenize(input string) []Token {
	var tokens []Token
	var current strings.Builder

	// flush emits the pending literal (if any), classifying it as a
	// number when it starts with a digit and an identifier otherwise.
	// The original repeated this classification at three call sites.
	flush := func() {
		if current.Len() == 0 {
			return
		}
		lit := current.String()
		current.Reset()
		if unicode.IsDigit(rune(lit[0])) {
			tokens = append(tokens, Token{TokenNumber, lit})
		} else {
			tokens = append(tokens, Token{TokenIdentifier, lit})
		}
	}

	for _, ch := range input {
		switch {
		case unicode.IsDigit(ch):
			current.WriteRune(ch)
		case unicode.IsLetter(ch):
			// A letter after a digit run terminates the number.
			if current.Len() > 0 && unicode.IsDigit(rune(current.String()[0])) {
				flush()
			}
			current.WriteRune(ch)
		case strings.ContainsRune("+-*/", ch):
			flush()
			tokens = append(tokens, Token{TokenOperator, string(ch)})
		case unicode.IsSpace(ch):
			flush()
		}
	}
	flush() // emit a trailing literal, if any
	return tokens
}
// main tokenizes a small arithmetic expression and prints each token's
// type (as its numeric enum value) and literal text.
func main() {
	expr := "x + 42 * y"
	for _, tok := range tokenize(expr) {
		fmt.Printf("Type: %d, Value: %s\n", tok.Type, tok.Value)
	}
}
Building a Lexer
Lexer Implementation
package main
import (
"fmt"
"strings"
"unicode"
)
// Lexer performs cursor-based lexical scanning over an input string.
// pos indexes bytes and current holds the byte at pos widened to a rune
// (0 once the input is exhausted).
// NOTE(review): because indexing is byte-wise, multi-byte UTF-8
// characters would be split — this assumes ASCII input; confirm callers.
type Lexer struct {
	input   string
	pos     int
	current rune
}
// NewLexer returns a Lexer positioned at the first byte of input.
// For empty input, current stays 0, which NextToken reports as EOF.
func NewLexer(input string) *Lexer {
	lex := &Lexer{input: input}
	if input != "" {
		lex.current = rune(input[0])
	}
	return lex
}
// advance moves the cursor one byte forward, setting current to 0 when
// the end of the input is reached.
func (l *Lexer) advance() {
	l.pos++
	if l.pos < len(l.input) {
		l.current = rune(l.input[l.pos])
		return
	}
	l.current = 0
}
// peek returns the byte after the current one without consuming it,
// or 0 when fewer than two bytes remain.
func (l *Lexer) peek() rune {
	if next := l.pos + 1; next < len(l.input) {
		return rune(l.input[next])
	}
	return 0
}
// skipWhitespace consumes a run of consecutive whitespace characters,
// leaving the cursor on the first non-space character (or at EOF).
func (l *Lexer) skipWhitespace() {
	for {
		if !unicode.IsSpace(l.current) {
			return
		}
		l.advance()
	}
}
// readNumber consumes a maximal run of decimal digits and returns it.
// Uses strings.Builder instead of the original string += loop, which
// reallocated the accumulator on every digit.
func (l *Lexer) readNumber() string {
	var b strings.Builder
	for unicode.IsDigit(l.current) {
		b.WriteRune(l.current)
		l.advance()
	}
	return b.String()
}
// readIdentifier consumes a maximal run of letters, digits, and
// underscores and returns it. Uses strings.Builder instead of the
// original quadratic string += loop.
func (l *Lexer) readIdentifier() string {
	var b strings.Builder
	for unicode.IsLetter(l.current) || unicode.IsDigit(l.current) || l.current == '_' {
		b.WriteRune(l.current)
		l.advance()
	}
	return b.String()
}
// NextToken scans and returns the next token, or a TokenEOF token once
// the input is exhausted.
//
// Bug fix: the original returned TokenEOF as soon as it met a character
// that was not a digit, letter, or one of "+-*/", which silently dropped
// the remainder of the input. Unrecognised characters are now skipped
// and scanning continues.
func (l *Lexer) NextToken() Token {
	for {
		l.skipWhitespace()
		switch {
		case l.current == 0:
			return Token{TokenEOF, ""}
		case unicode.IsDigit(l.current):
			return Token{TokenNumber, l.readNumber()}
		case unicode.IsLetter(l.current):
			return Token{TokenIdentifier, l.readIdentifier()}
		case strings.ContainsRune("+-*/", l.current):
			tok := Token{TokenOperator, string(l.current)}
			l.advance()
			return tok
		default:
			// Unrecognised character: skip it and keep scanning.
			l.advance()
		}
	}
}
// main drives the lexer over a sample expression, printing tokens until
// EOF is reached.
func main() {
	lex := NewLexer("x + 42 * y")
	for tok := lex.NextToken(); tok.Type != TokenEOF; tok = lex.NextToken() {
		fmt.Printf("Type: %d, Value: %s\n", tok.Type, tok.Value)
	}
}
Building a Parser
Simple Expression Parser
package main
import (
"fmt"
"strconv"
)
// Parser is a recursive-descent evaluator for integer arithmetic.
// It pulls tokens from lexer one at a time, keeping a single token of
// lookahead in current.
type Parser struct {
	lexer   *Lexer
	current Token
}
// NewParser builds a Parser over input and primes the one-token
// lookahead with the first token from the lexer.
func NewParser(input string) *Parser {
	p := &Parser{lexer: NewLexer(input)}
	p.current = p.lexer.NextToken()
	return p
}
// advance consumes the current token and loads the next one from the
// lexer into the lookahead.
func (p *Parser) advance() {
	p.current = p.lexer.NextToken()
}
// parseNumber converts the current number token to an int and consumes it.
// NOTE(review): the strconv.Atoi error is deliberately discarded — the
// lexer only emits digit runs — but a run that overflows int would be
// silently misparsed; consider surfacing the error.
func (p *Parser) parseNumber() int {
	num, _ := strconv.Atoi(p.current.Value)
	p.advance()
	return num
}
// parseExpression parses and evaluates addition and subtraction,
// left-associatively: expression := term (("+" | "-") term)*.
func (p *Parser) parseExpression() int {
	acc := p.parseTerm()
	for p.current.Type == TokenOperator {
		op := p.current.Value
		if op != "+" && op != "-" {
			break
		}
		p.advance()
		switch rhs := p.parseTerm(); op {
		case "+":
			acc += rhs
		default:
			acc -= rhs
		}
	}
	return acc
}
// parseTerm parses and evaluates multiplication and division,
// left-associatively: term := factor (("*" | "/") factor)*.
// NOTE(review): integer division by zero panics at runtime; there is no
// guard for inputs like "1 / 0".
func (p *Parser) parseTerm() int {
	acc := p.parseFactor()
	for p.current.Type == TokenOperator {
		op := p.current.Value
		if op != "*" && op != "/" {
			break
		}
		p.advance()
		rhs := p.parseFactor()
		if op == "*" {
			acc *= rhs
		} else {
			acc /= rhs
		}
	}
	return acc
}
// parseFactor parses a single operand. Only number literals are
// supported; any other token yields 0 and is not consumed.
func (p *Parser) parseFactor() int {
	if p.current.Type != TokenNumber {
		return 0
	}
	return p.parseNumber()
}
// main evaluates a sample expression, demonstrating that * binds more
// tightly than +.
func main() {
	p := NewParser("2 + 3 * 4")
	fmt.Println("Result:", p.parseExpression()) // Output: 14
}
Practical Parsing Examples
CSV Parser
package main
import (
"fmt"
"strings"
)
// parseCSV splits one CSV record into its fields. Commas inside
// double-quoted sections do not split; the quotes themselves are
// dropped, and each field is whitespace-trimmed.
//
// Bug fix: per RFC 4180, a doubled quote ("") inside a quoted field is
// an escaped literal quote. The original treated it as two toggles and
// dropped it; it is now emitted as a single '"'.
func parseCSV(line string) []string {
	var fields []string
	var field strings.Builder
	inQuotes := false
	runes := []rune(line)
	for i := 0; i < len(runes); i++ {
		ch := runes[i]
		switch {
		case ch == '"':
			if inQuotes && i+1 < len(runes) && runes[i+1] == '"' {
				field.WriteRune('"') // escaped quote inside a quoted field
				i++
			} else {
				inQuotes = !inQuotes
			}
		case ch == ',' && !inQuotes:
			fields = append(fields, strings.TrimSpace(field.String()))
			field.Reset()
		default:
			field.WriteRune(ch)
		}
	}
	// The final field has no trailing comma; emit whatever remains.
	fields = append(fields, strings.TrimSpace(field.String()))
	return fields
}
// main parses one sample CSV record and prints each field with its index.
func main() {
	record := `"John Doe", 30, "[email protected]"`
	for i, f := range parseCSV(record) {
		fmt.Printf("Field %d: %s\n", i, f)
	}
}
JSON-like Parser
package main
import (
"fmt"
"strings"
"unicode"
)
// JSONValue is the dynamic result type of the parser; this simplified
// implementation only ever produces string values (or nil).
type JSONValue interface{}

// JSONParser is a cursor-based parser over a JSON-like input string.
// pos indexes bytes into input.
type JSONParser struct {
	input string
	pos   int
}
// NewJSONParser returns a parser positioned at the start of input.
// pos starts at its zero value, the first byte.
func NewJSONParser(input string) *JSONParser {
	return &JSONParser{input: input}
}
// skipWhitespace advances pos past any run of whitespace bytes, stopping
// at the first non-space byte or the end of input.
func (p *JSONParser) skipWhitespace() {
	for p.pos < len(p.input) {
		if !unicode.IsSpace(rune(p.input[p.pos])) {
			return
		}
		p.pos++
	}
}
// parseString reads a double-quoted string starting at the opening quote
// and returns its contents; pos is left just past the closing quote.
//
// Bug fix: the original built the result with string(p.input[p.pos]),
// which widens each byte to a code point and corrupts multi-byte UTF-8
// content; WriteByte preserves the raw bytes.
// NOTE(review): JSON escape sequences (\", \n, \uXXXX) are not handled,
// and an unterminated string consumes the rest of the input.
func (p *JSONParser) parseString() string {
	p.pos++ // skip the opening quote
	var b strings.Builder
	for p.pos < len(p.input) && p.input[p.pos] != '"' {
		b.WriteByte(p.input[p.pos])
		p.pos++
	}
	p.pos++ // skip the closing quote
	return b.String()
}
// parseValue parses the next JSON value. Only string literals are
// supported; anything else — including end of input — yields nil.
//
// Bug fix: the original indexed p.input[p.pos] without a bounds check
// and panicked on empty or all-whitespace input.
func (p *JSONParser) parseValue() JSONValue {
	p.skipWhitespace()
	if p.pos >= len(p.input) {
		return nil
	}
	if p.input[p.pos] == '"' {
		return p.parseString()
	}
	return nil
}
// main parses a quoted string literal and prints the decoded value.
func main() {
	p := NewJSONParser(`"Hello, World!"`)
	fmt.Println("Parsed:", p.parseValue())
}
Configuration File Parser
package main
import (
"fmt"
"strings"
)
// Config holds key/value settings parsed from a simple INI-like format.
type Config map[string]string

// parseConfig reads "key = value" lines from content. Blank lines,
// comment lines beginning with '#', and lines without '=' are skipped;
// keys and values are whitespace-trimmed. Only the first '=' splits, so
// values may themselves contain '='.
func parseConfig(content string) Config {
	cfg := make(Config)
	for _, raw := range strings.Split(content, "\n") {
		line := strings.TrimSpace(raw)
		if line == "" || strings.HasPrefix(line, "#") {
			continue // blank line or comment
		}
		key, value, ok := strings.Cut(line, "=")
		if !ok {
			continue // not a key=value line
		}
		cfg[strings.TrimSpace(key)] = strings.TrimSpace(value)
	}
	return cfg
}
// main parses a sample configuration document and prints every setting.
// Note: map iteration order is random, so output order varies run to run.
func main() {
	content := `
# Configuration file
host = localhost
port = 8080
debug = true
`
	for key, value := range parseConfig(content) {
		fmt.Printf("%s: %s\n", key, value)
	}
}
Best Practices
✅ Good Practices
// Use lexer for tokenization
lexer := NewLexer(input)
// Use parser for syntax analysis
parser := NewParser(input)
// Handle errors properly
if err != nil {
// Handle error
}
// Use recursive descent for simple grammars
// Use more advanced techniques for complex grammars
// Test with various inputs
// Include edge cases
❌ Anti-Patterns
// Don't parse without tokenizing
// Tokenization simplifies parsing
// Don't ignore errors
// Always handle parsing errors
// Don't use regex for complex parsing
// Use proper parsers instead
// Don't hardcode parsing logic
// Use structured approach
Resources
Summary
Parsing and tokenization are essential for text processing:
- Use lexers for tokenization
- Use parsers for syntax analysis
- Implement recursive descent for simple grammars
- Handle errors properly
- Test with various inputs
- Use appropriate techniques for complexity
With these techniques, you can build robust parsers for various data formats in Go.
Comments