Unicode and String Encoding in Go
Unicode is fundamental to modern text processing. Go has excellent Unicode support built-in. This guide covers Unicode concepts and practical string encoding techniques.
Unicode Fundamentals
Understanding Runes and Bytes
// Demonstrates the difference between byte length (len) and
// character count (utf8.RuneCountInString) for ASCII, CJK, and emoji.
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// ASCII: one byte per character, so both counts match.
	ascii := "Hello"
	fmt.Printf("ASCII: %q\n", ascii)
	fmt.Printf("Length (bytes): %d\n", len(ascii))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(ascii))

	// CJK characters occupy 3 bytes each in UTF-8, so byte length
	// exceeds the rune count. (Renamed from `unicode` to avoid
	// shadowing the standard-library package name.)
	unicodeStr := "Hello 世界"
	fmt.Printf("\nUnicode: %q\n", unicodeStr)
	fmt.Printf("Length (bytes): %d\n", len(unicodeStr))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(unicodeStr))

	// Emoji are encoded with 4 bytes each in UTF-8.
	emoji := "Hello 🌍"
	fmt.Printf("\nEmoji: %q\n", emoji)
	fmt.Printf("Length (bytes): %d\n", len(emoji))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(emoji))
}
Iterating Over Runes
// Demonstrates correct rune-based iteration versus byte-based
// indexing over a string containing multi-byte characters.
package main

import (
	"fmt"
)

func main() {
	text := "Hello 世界"

	// range over a string decodes one rune per iteration; the index
	// is the rune's byte offset, so it jumps by 3 across CJK chars.
	fmt.Println("By rune:")
	for i, r := range text {
		fmt.Printf("Index %d: %c (U+%04X)\n", i, r, r)
	}

	// Indexing by byte yields raw UTF-8 bytes and splits multi-byte
	// characters — wrong for Unicode text.
	fmt.Println("\nBy byte:")
	for i := 0; i < len(text); i++ {
		fmt.Printf("Index %d: %d\n", i, text[i])
	}
}
UTF-8 Encoding
UTF-8 Basics
// Shows how many bytes UTF-8 uses for characters in different ranges.
// (The original imported unicode/utf8 without using it, which is a
// compile error in Go; the import is removed.)
package main

import (
	"fmt"
)

func main() {
	// ASCII: 1 byte.
	text := "A"
	fmt.Printf("'%s' = %v (1 byte)\n", text, []byte(text))

	// Latin-1 supplement (U+0080–U+07FF range): 2 bytes.
	text = "é"
	fmt.Printf("'%s' = %v (2 bytes)\n", text, []byte(text))

	// CJK (U+0800–U+FFFF range): 3 bytes.
	text = "中"
	fmt.Printf("'%s' = %v (3 bytes)\n", text, []byte(text))

	// Supplementary planes (emoji): 4 bytes.
	text = "😀"
	fmt.Printf("'%s' = %v (4 bytes)\n", text, []byte(text))
}
Encoding and Decoding Runes
// Demonstrates encoding a rune to UTF-8 bytes and decoding the first
// and last runes of a byte slice.
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// Encode a rune into its UTF-8 byte sequence. utf8.UTFMax (4) is
	// the largest encoding any rune can need.
	// (The original literal was mojibake spanning two characters,
	// which is an invalid rune literal.)
	r := '世'
	buf := make([]byte, utf8.UTFMax)
	n := utf8.EncodeRune(buf, r)
	fmt.Printf("Encoded '%c': %v (size: %d)\n", r, buf[:n], n)

	// Decode the first rune from a UTF-8 byte slice. (Renamed from
	// `bytes` to avoid shadowing the standard-library package name.)
	encoded := []byte("世")
	r, size := utf8.DecodeRune(encoded)
	fmt.Printf("Decoded: '%c' (size: %d)\n", r, size)

	// Decode the final rune in the slice.
	r, size = utf8.DecodeLastRune(encoded)
	fmt.Printf("Last rune: '%c' (size: %d)\n", r, size)
}
String Validation
Validating UTF-8
// Demonstrates validating UTF-8 in strings and byte slices.
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// Go source is UTF-8, so string literals are always valid.
	validStr := "Hello 世界"
	fmt.Printf("'%s' is valid UTF-8: %v\n", validStr, utf8.ValidString(validStr))

	validBytes := []byte("Hello")
	fmt.Printf("Bytes are valid UTF-8: %v\n", utf8.Valid(validBytes))

	// 0xFF and 0xFE can never appear in well-formed UTF-8 (this pair
	// is the UTF-16 byte-order mark), so Valid reports false.
	invalidBytes := []byte{0xFF, 0xFE}
	fmt.Printf("Invalid bytes are valid UTF-8: %v\n", utf8.Valid(invalidBytes))
}
Character Classification
Unicode Properties
// Demonstrates Unicode character classification predicates and
// general-category lookup.
package main

import (
	"fmt"
	"unicode"
)

// categoryOf returns the name of the two-letter Unicode general
// category (e.g. "Lu", "Nd", "Zs") that r belongs to, or "" if none
// matches. (The original called unicode.Category, which does not
// exist; the unicode package exposes a Categories map instead.)
func categoryOf(r rune) string {
	for name, table := range unicode.Categories {
		// Skip the one-letter super-categories ("L", "N", ...) so the
		// specific category is reported. A rune belongs to exactly one
		// two-letter category, so map iteration order doesn't matter.
		if len(name) == 2 && unicode.Is(table, r) {
			return name
		}
	}
	return ""
}

func main() {
	tests := []rune{'A', 'a', '5', ' ', '中', '🎉'}
	for _, r := range tests {
		fmt.Printf("'%c' (U+%04X):\n", r, r)
		fmt.Printf(" IsLetter: %v\n", unicode.IsLetter(r))
		fmt.Printf(" IsDigit: %v\n", unicode.IsDigit(r))
		fmt.Printf(" IsSpace: %v\n", unicode.IsSpace(r))
		fmt.Printf(" IsUpper: %v\n", unicode.IsUpper(r))
		fmt.Printf(" IsLower: %v\n", unicode.IsLower(r))
		fmt.Printf(" Category: %s\n", categoryOf(r))
		fmt.Println()
	}
}
Case Conversion
Unicode Case Operations
// Demonstrates string-level and per-rune case conversion.
package main

import (
	"fmt"
	"strings"
	"unicode"
)

func main() {
	text := "Hello World 世界"

	fmt.Printf("Original: %s\n", text)
	fmt.Printf("Upper: %s\n", strings.ToUpper(text))
	fmt.Printf("Lower: %s\n", strings.ToLower(text))
	// NOTE: strings.Title is deprecated (it can't handle Unicode
	// word boundaries correctly); golang.org/x/text/cases is the
	// modern replacement. Kept here to illustrate the legacy API.
	fmt.Printf("Title: %s\n", strings.Title(text))

	// Per-rune conversion handles accented letters like é correctly.
	fmt.Println("\nRune conversion:")
	for _, r := range "Café" {
		fmt.Printf("'%c' -> Upper: '%c', Lower: '%c'\n", r, unicode.ToUpper(r), unicode.ToLower(r))
	}
}
Normalization
Unicode Normalization
package main
import (
"fmt"
"unicode"
"unicode/norm"
)
func main() {
// Composed form (รฉ as single character)
composed := "cafรฉ"
// Decomposed form (e + accent)
decomposed := "cafe\u0301"
fmt.Printf("Composed: %q\n", composed)
fmt.Printf("Decomposed: %q\n", decomposed)
fmt.Printf("Are they equal? %v\n", composed == decomposed)
// Normalize to NFC (composed)
nfc := norm.NFC.String(decomposed)
fmt.Printf("Normalized (NFC): %q\n", nfc)
fmt.Printf("Equal after normalization? %v\n", composed == nfc)
// Normalize to NFD (decomposed)
nfd := norm.NFD.String(composed)
fmt.Printf("Normalized (NFD): %q\n", nfd)
}
Practical Examples
Counting Characters Correctly
package main

import (
	"fmt"
	"unicode/utf8"
)

// countCharacters returns the byte length and rune count of text.
// The two differ whenever text contains multi-byte UTF-8 sequences.
func countCharacters(text string) map[string]int {
	return map[string]int{
		"bytes": len(text),
		"runes": utf8.RuneCountInString(text),
	}
}

func main() {
	tests := []string{
		"Hello",
		"Hello 世界",
		"🎉🎉🎉",
		"Café",
	}
	for _, text := range tests {
		counts := countCharacters(text)
		fmt.Printf("'%s': %d bytes, %d characters\n", text, counts["bytes"], counts["runes"])
	}
}
Substring Operations
// Shows why substrings must be taken on rune boundaries.
package main

import (
	"fmt"
)

func main() {
	text := "Hello 世界"

	// Byte slicing happens to work here only because the first five
	// bytes are ASCII; it would split a multi-byte character otherwise.
	fmt.Printf("text[0:5] = %q (wrong for Unicode)\n", text[0:5])

	// Converting to []rune gives character-level indexing; index 6-8
	// selects the two CJK characters after "Hello ".
	runes := []rune(text)
	fmt.Printf("runes[0:5] = %q\n", string(runes[0:5]))
	fmt.Printf("runes[6:8] = %q\n", string(runes[6:8]))
}
Text Truncation
// (The original imported unicode/utf8 without using it, which is a
// compile error in Go; the import is removed.)
package main

import (
	"fmt"
)

// truncateString shortens text to at most maxRunes characters,
// appending "..." when truncation occurred. Counting runes rather
// than bytes guarantees multi-byte characters are never split.
// A negative maxRunes is treated as 0.
func truncateString(text string, maxRunes int) string {
	if maxRunes < 0 {
		// Guard: runes[:maxRunes] would panic on a negative bound.
		maxRunes = 0
	}
	runes := []rune(text)
	if len(runes) <= maxRunes {
		return text
	}
	return string(runes[:maxRunes]) + "..."
}

func main() {
	tests := []string{
		"Hello World",
		"Hello 世界",
		"🎉🎉🎉🎉",
	}
	for _, text := range tests {
		truncated := truncateString(text, 5)
		fmt.Printf("'%s' -> '%s'\n", text, truncated)
	}
}
Grapheme Clustering
// Demonstrates a simplified form of grapheme clustering: grouping
// combining marks with the base character they modify.
package main

import (
	"fmt"
	"unicode"
)

// isGraphemeCluster reports whether r is a combining mark (Unicode
// category M) that should attach to the preceding base character.
// This is a simplified heuristic, not full UAX #29 segmentation.
func isGraphemeCluster(r rune) bool {
	return unicode.Is(unicode.Mark, r)
}

// splitGraphemes groups each base character with any combining marks
// that follow it and returns the resulting clusters in order.
func splitGraphemes(text string) []string {
	var clusters []string
	pending := ""
	for _, r := range text {
		switch {
		case isGraphemeCluster(r):
			// Combining mark: extend the cluster under construction.
			pending += string(r)
		default:
			// New base character: flush the previous cluster first.
			if pending != "" {
				clusters = append(clusters, pending)
			}
			pending = string(r)
		}
	}
	if pending != "" {
		clusters = append(clusters, pending)
	}
	return clusters
}

func main() {
	text := "e\u0301" // é as e + combining acute
	graphemes := splitGraphemes(text)
	fmt.Printf("Text: %q\n", text)
	fmt.Printf("Graphemes: %v\n", graphemes)
}
Best Practices
✅ Good Practices
// Illustrative fragment, not a complete program — assumes text, r,
// and the utf8/unicode/norm packages are already in scope.
// Use runes for character operations
runes := []rune(text)
for i, r := range runes {
// Process character
}
// Use utf8.RuneCountInString for character count
count := utf8.RuneCountInString(text)
// Validate UTF-8
if !utf8.ValidString(text) {
// Handle invalid UTF-8
}
// Use unicode package for properties
if unicode.IsLetter(r) {
// Process letter
}
// Normalize when comparing
// (norm is golang.org/x/text/unicode/norm, not the standard library)
normalized := norm.NFC.String(text)
❌ Anti-Patterns
// Illustrative fragment of what NOT to do — assumes text is in scope.
// Don't use len() for character count
count := len(text) // Wrong for Unicode!
// Don't index strings directly
char := text[5] // May split multi-byte character
// Don't assume ASCII
// Always consider Unicode
// Don't ignore encoding
// Always validate UTF-8
Resources
- Go unicode Package Documentation
- Go unicode/utf8 Package Documentation
- golang.org/x/text/unicode/norm Package Documentation (not part of the standard library)
- UTF-8 Specification
- Unicode Standard
Summary
Unicode and string encoding in Go:
- Understand the difference between bytes and runes
- Use runes for character operations
- Use utf8.RuneCountInString for character count
- Validate UTF-8 input
- Use unicode package for character properties
- Normalize text when comparing
- Handle internationalization properly
With these techniques, you can correctly handle Unicode text in Go applications.
Comments