Unicode is fundamental to modern text processing. Go has excellent Unicode support built-in. This guide covers Unicode concepts and practical string encoding techniques. For more context, see Go Installation Guide, Go Ecosystem Overview, Go Best Practices.
Unicode Fundamentals
Understanding Runes and Bytes
package main

import (
	"fmt"
	"unicode/utf8"
)

// report prints a labeled string together with its byte length and its
// rune (character) count, highlighting that the two differ for non-ASCII.
func report(label, s string) {
	fmt.Printf("%s: %q\n", label, s)
	fmt.Printf("Length (bytes): %d\n", len(s))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(s))
}

func main() {
	// Plain ASCII: byte length and rune count agree.
	report("ASCII", "Hello")
	// CJK characters occupy three bytes each in UTF-8.
	report("\nUnicode", "Hello 世界")
	// Emoji live outside the BMP and take four bytes each.
	report("\nEmoji", "Hello 👋")
}
Iterating Over Runes
package main

import (
	"fmt"
)

func main() {
	const sample = "Hello 世界"

	// range decodes one rune per iteration; the index advances by the
	// rune's encoded width, so it is a byte offset, not a rune count.
	fmt.Println("By rune:")
	for offset, ch := range sample {
		fmt.Printf("Index %d: %c (U+%04X)\n", offset, ch, ch)
	}

	// Byte-wise indexing walks the raw UTF-8 encoding and splits
	// multi-byte characters — usually wrong for text processing.
	fmt.Println("\nBy byte:")
	for b := 0; b < len(sample); b++ {
		fmt.Printf("Index %d: %d\n", b, sample[b])
	}
}
UTF-8 Encoding
UTF-8 Basics
package main

import (
	"fmt"
)

// main demonstrates how many bytes UTF-8 needs for code points of
// increasing magnitude: one byte for ASCII up to four for emoji.
// (Fix: the original imported "unicode/utf8" without using it, which
// is a compile error in Go.)
func main() {
	// 1 byte: ASCII (U+0000..U+007F).
	text := "A"
	fmt.Printf("'%s' = %v (1 byte)\n", text, []byte(text))
	// 2 bytes: U+0080..U+07FF (Latin supplements, Greek, Cyrillic, ...).
	text = "é"
	fmt.Printf("'%s' = %v (2 bytes)\n", text, []byte(text))
	// 3 bytes: U+0800..U+FFFF, which covers most of the BMP including CJK.
	text = "中"
	fmt.Printf("'%s' = %v (3 bytes)\n", text, []byte(text))
	// 4 bytes: supplementary planes (U+10000..U+10FFFF), e.g. emoji.
	text = "😀"
	fmt.Printf("'%s' = %v (4 bytes)\n", text, []byte(text))
}
Encoding and Decoding Runes
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// Encode a rune into a caller-supplied buffer. EncodeRune reports how
	// many bytes it wrote; utf8.UTFMax (4) is always a big enough buffer.
	ch := '世'
	scratch := make([]byte, utf8.UTFMax)
	written := utf8.EncodeRune(scratch, ch)
	fmt.Printf("Encoded '%c': %v (size: %d)\n", ch, scratch[:written], written)

	// Decode the first rune from a UTF-8 byte slice.
	encoded := []byte("世")
	first, firstLen := utf8.DecodeRune(encoded)
	fmt.Printf("Decoded: '%c' (size: %d)\n", first, firstLen)

	// Decode the final rune (here the slice holds only one rune).
	last, lastLen := utf8.DecodeLastRune(encoded)
	fmt.Printf("Last rune: '%c' (size: %d)\n", last, lastLen)
}
String Validation
Validating UTF-8
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// A Go string literal in source is always well-formed UTF-8.
	ok := "Hello 世界"
	fmt.Printf("'%s' is valid UTF-8: %v\n", ok, utf8.ValidString(ok))

	// utf8.Valid is the []byte counterpart of ValidString.
	fmt.Printf("Bytes are valid UTF-8: %v\n", utf8.Valid([]byte("Hello")))

	// 0xFF and 0xFE can never occur in UTF-8, so validation fails.
	bogus := []byte{0xFF, 0xFE}
	fmt.Printf("Invalid bytes are valid UTF-8: %v\n", utf8.Valid(bogus))
}
Character Classification
Unicode Properties
package main

import (
	"fmt"
	"unicode"
)

// categoryOf returns the two-letter Unicode general category (e.g. "Lu",
// "Nd", "Zs") containing r, or "unknown" if none matches. The standard
// library exposes the category tables via the unicode.Categories map but
// has no direct rune-to-category lookup (the original code called a
// non-existent unicode.Category function, which does not compile), so we
// scan the map. One-letter keys ("L", "N", ...) are supersets and are
// skipped, leaving exactly one match per rune.
func categoryOf(r rune) string {
	for name, table := range unicode.Categories {
		if len(name) == 2 && unicode.Is(table, r) {
			return name
		}
	}
	return "unknown"
}

func main() {
	tests := []rune{'A', 'a', '5', ' ', '中', '😀'}
	for _, r := range tests {
		fmt.Printf("'%c' (U+%04X):\n", r, r)
		fmt.Printf("  IsLetter: %v\n", unicode.IsLetter(r))
		fmt.Printf("  IsDigit: %v\n", unicode.IsDigit(r))
		fmt.Printf("  IsSpace: %v\n", unicode.IsSpace(r))
		fmt.Printf("  IsUpper: %v\n", unicode.IsUpper(r))
		fmt.Printf("  IsLower: %v\n", unicode.IsLower(r))
		fmt.Printf("  Category: %s\n", categoryOf(r))
		fmt.Println()
	}
}
Case Conversion
Unicode Case Operations
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// titleCase upper-cases the first letter of every word, where a word is a
// maximal run of letters. It replaces the deprecated strings.Title; the
// officially recommended replacement (golang.org/x/text/cases) lives
// outside the standard library, so this keeps the example dependency-free.
func titleCase(s string) string {
	inWord := false
	return strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) {
			if !inWord {
				inWord = true
				return unicode.ToTitle(r)
			}
			return r
		}
		inWord = false
		return r
	}, s)
}

func main() {
	text := "Hello World 世界"

	// Whole-string case conversion.
	fmt.Printf("Original: %s\n", text)
	fmt.Printf("Upper: %s\n", strings.ToUpper(text))
	fmt.Printf("Lower: %s\n", strings.ToLower(text))
	fmt.Printf("Title: %s\n", titleCase(text))

	// Per-rune case conversion; runes without a case mapping (like é's
	// base forms already cased) come back unchanged where appropriate.
	fmt.Println("\nRune conversion:")
	for _, r := range "Café" {
		fmt.Printf("'%c' -> Upper: '%c', Lower: '%c'\n", r, unicode.ToUpper(r), unicode.ToLower(r))
	}
}
Normalization
Unicode Normalization (note: the norm package is not part of the standard library — it lives at golang.org/x/text/unicode/norm; install it with `go get golang.org/x/text`)
package main
import (
"fmt"
"unicode"
"unicode/norm"
)
func main() {
// Composed form (é as single character)
composed := "café"
// Decomposed form (e + accent)
decomposed := "cafe\u0301"
fmt.Printf("Composed: %q\n", composed)
fmt.Printf("Decomposed: %q\n", decomposed)
fmt.Printf("Are they equal? %v\n", composed == decomposed)
// Normalize to NFC (composed)
nfc := norm.NFC.String(decomposed)
fmt.Printf("Normalized (NFC): %q\n", nfc)
fmt.Printf("Equal after normalization? %v\n", composed == nfc)
// Normalize to NFD (decomposed)
nfd := norm.NFD.String(composed)
fmt.Printf("Normalized (NFD): %q\n", nfd)
}
Practical Examples
Counting Characters Correctly
package main

import (
	"fmt"
	"unicode/utf8"
)

// countCharacters reports the byte length and the rune (character) count
// of text, keyed as "bytes" and "runes" respectively.
func countCharacters(text string) map[string]int {
	return map[string]int{
		"bytes": len(text),
		"runes": utf8.RuneCountInString(text),
	}
}

func main() {
	samples := []string{"Hello", "Hello 世界", "🎉🎊🎈", "Café"}
	for _, sample := range samples {
		stats := countCharacters(sample)
		fmt.Printf("'%s': %d bytes, %d characters\n", sample, stats["bytes"], stats["runes"])
	}
}
Substring Operations
package main

import (
	"fmt"
)

func main() {
	const phrase = "Hello 世界"

	// Slicing a string indexes bytes, so it can cut a multi-byte
	// character in half; it only works here because the prefix is ASCII.
	fmt.Printf("text[0:5] = %q (wrong for Unicode)\n", phrase[0:5])

	// Converting to []rune first makes indices character positions.
	chars := []rune(phrase)
	fmt.Printf("runes[0:5] = %q\n", string(chars[0:5]))
	fmt.Printf("runes[6:8] = %q\n", string(chars[6:8]))
}
Text Truncation
package main

import (
	"fmt"
)

// truncateString shortens text to at most maxRunes characters, appending
// "..." when anything was cut. Counting runes rather than bytes keeps
// multi-byte characters (CJK, emoji) intact.
// (Fix: the original imported "unicode/utf8" without using it, which is
// a compile error in Go.)
func truncateString(text string, maxRunes int) string {
	runes := []rune(text)
	if len(runes) <= maxRunes {
		return text
	}
	return string(runes[:maxRunes]) + "..."
}

func main() {
	tests := []string{
		"Hello World",
		"Hello 世界",
		"🎉🎊🎈🎁",
	}
	for _, text := range tests {
		truncated := truncateString(text, 5)
		fmt.Printf("'%s' -> '%s'\n", text, truncated)
	}
}
Grapheme Clustering
package main

import (
	"fmt"
	"unicode"
)

// isCombiningMark reports whether r is a Unicode mark (category M),
// i.e. a code point that attaches to the preceding base character.
func isCombiningMark(r rune) bool {
	return unicode.Is(unicode.Mark, r)
}

// splitGraphemes groups text into rough grapheme clusters: each base rune
// plus any combining marks that follow it. (Full grapheme segmentation
// per UAX #29 handles more cases, e.g. emoji ZWJ sequences.)
func splitGraphemes(text string) []string {
	var clusters []string
	pending := ""
	for _, r := range text {
		if isCombiningMark(r) {
			// Marks extend the cluster under construction.
			pending += string(r)
			continue
		}
		// A base rune starts a new cluster; flush the previous one.
		if pending != "" {
			clusters = append(clusters, pending)
		}
		pending = string(r)
	}
	if pending != "" {
		clusters = append(clusters, pending)
	}
	return clusters
}

func main() {
	text := "e\u0301" // é as e + combining acute
	graphemes := splitGraphemes(text)
	fmt.Printf("Text: %q\n", text)
	fmt.Printf("Graphemes: %v\n", graphemes)
}
Best Practices
✅ Good Practices
// Use runes for character operations
runes := []rune(text)
for i, r := range runes {
// Process character
}
// Use utf8.RuneCountInString for character count
count := utf8.RuneCountInString(text)
// Validate UTF-8
if !utf8.ValidString(text) {
// Handle invalid UTF-8
}
// Use unicode package for properties
if unicode.IsLetter(r) {
// Process letter
}
// Normalize when comparing
normalized := norm.NFC.String(text)
❌ Anti-Patterns
// Don't use len() for character count
count := len(text) // Wrong for Unicode!
// Don't index strings directly
char := text[5] // May split multi-byte character
// Don't assume ASCII
// Always consider Unicode
// Don't ignore encoding
// Always validate UTF-8
Resources
- Go unicode Package Documentation
- Go unicode/utf8 Package Documentation
- golang.org/x/text/unicode/norm Package Documentation
- UTF-8 Specification
- Unicode Standard
Summary
Unicode and string encoding in Go:
- Understand the difference between bytes and runes
- Use runes for character operations
- Use utf8.RuneCountInString for character count
- Validate UTF-8 input
- Use unicode package for character properties
- Normalize text when comparing
- Handle internationalization properly
With these techniques, you can correctly handle Unicode text in Go applications.
Comments