Skip to main content
⚡ Calmops

Unicode and String Encoding in Go

Unicode and String Encoding in Go

Unicode is fundamental to modern text processing, and Go has excellent built-in Unicode support. This guide covers core Unicode concepts and practical string-encoding techniques.

Unicode Fundamentals

Understanding Runes and Bytes

package main

import (
	"fmt"
	"unicode/utf8"
)

// main contrasts the byte length of a string (len) with its rune count
// (utf8.RuneCountInString) for ASCII, CJK, and emoji text.
func main() {
	// ASCII string: one byte per character, so both counts agree.
	ascii := "Hello"
	fmt.Printf("ASCII: %q\n", ascii)
	fmt.Printf("Length (bytes): %d\n", len(ascii))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(ascii))

	// Unicode string: each CJK ideograph below is three bytes in UTF-8.
	// Named cjk rather than unicode so it cannot shadow the unicode package.
	cjk := "Hello 世界"
	fmt.Printf("\nUnicode: %q\n", cjk)
	fmt.Printf("Length (bytes): %d\n", len(cjk))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(cjk))

	// Emoji: the waving hand (U+1F44B) is four bytes in UTF-8.
	emoji := "Hello 👋"
	fmt.Printf("\nEmoji: %q\n", emoji)
	fmt.Printf("Length (bytes): %d\n", len(emoji))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(emoji))
}

Iterating Over Runes

package main

import (
	"fmt"
)

// main walks the same string twice: once with range, which decodes UTF-8 and
// yields runes, and once with a plain index loop, which yields raw bytes.
func main() {
	text := "Hello 世界"

	// Iterate by rune (correct): i is the byte offset at which r starts,
	// so it jumps by more than one after a multi-byte character.
	fmt.Println("By rune:")
	for i, r := range text {
		fmt.Printf("Index %d: %c (U+%04X)\n", i, r, r)
	}

	// Iterate by byte (wrong for Unicode): text[i] is a single byte, so
	// multi-byte characters show up as several unrelated numbers.
	fmt.Println("\nBy byte:")
	for i := 0; i < len(text); i++ {
		fmt.Printf("Index %d: %d\n", i, text[i])
	}
}

UTF-8 Encoding

UTF-8 Basics

package main

import (
	"fmt"
	"unicode/utf8"
)

// main prints the UTF-8 byte sequence of one sample character for each
// encoded width (1 to 4 bytes).
func main() {
	// One sample per UTF-8 width: ASCII, Latin-1 supplement, CJK, emoji.
	samples := []rune{'A', 'é', '中', '😀'}

	for _, r := range samples {
		s := string(r)

		// The width is computed rather than hard-coded so the label can
		// never disagree with the bytes printed next to it.
		size := utf8.RuneLen(r)
		unit := "bytes"
		if size == 1 {
			unit = "byte"
		}

		fmt.Printf("'%s' = %v (%d %s)\n", s, []byte(s), size, unit)
	}
}

Encoding and Decoding Runes

package main

import (
	"fmt"
	"unicode/utf8"
)

// main round-trips a single rune through utf8.EncodeRune and
// utf8.DecodeRune / utf8.DecodeLastRune.
func main() {
	// Encode rune to UTF-8. utf8.UTFMax bytes is enough for any rune;
	// EncodeRune returns how many of them it actually wrote.
	r := '世'
	buf := make([]byte, utf8.UTFMax)
	n := utf8.EncodeRune(buf, r)
	fmt.Printf("Encoded '%c': %v (size: %d)\n", r, buf[:n], n)

	// Decode rune from UTF-8. The slice is named encoded (not bytes) so it
	// cannot be confused with the standard library's bytes package.
	encoded := []byte("世")
	r, size := utf8.DecodeRune(encoded)
	fmt.Printf("Decoded: '%c' (size: %d)\n", r, size)

	// Decode last rune
	r, size = utf8.DecodeLastRune(encoded)
	fmt.Printf("Last rune: '%c' (size: %d)\n", r, size)
}

String Validation

Validating UTF-8

package main

import (
	"fmt"
	"unicode/utf8"
)

// main shows utf8.ValidString and utf8.Valid on well-formed and malformed
// input.
func main() {
	// Valid UTF-8
	validStr := "Hello 世界"
	fmt.Printf("'%s' is valid UTF-8: %v\n", validStr, utf8.ValidString(validStr))

	// Valid bytes
	validBytes := []byte("Hello")
	fmt.Printf("Bytes are valid UTF-8: %v\n", utf8.Valid(validBytes))

	// Invalid UTF-8 (simulated): 0xFF and 0xFE never occur in well-formed
	// UTF-8, so this reports false.
	invalidBytes := []byte{0xFF, 0xFE}
	fmt.Printf("Invalid bytes are valid UTF-8: %v\n", utf8.Valid(invalidBytes))
}

Character Classification

Unicode Properties

package main

import (
	"fmt"
	"unicode"
)

// generalCategory returns the two-letter Unicode general category of r
// (for example "Lu" or "Nd"). The unicode package has no function that does
// this directly, so each category table in unicode.Categories is probed in a
// fixed order. Runes found in none of the listed tables are reported as "Cn"
// (unassigned).
func generalCategory(r rune) string {
	names := [...]string{
		"Lu", "Ll", "Lt", "Lm", "Lo",
		"Mn", "Mc", "Me",
		"Nd", "Nl", "No",
		"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
		"Sm", "Sc", "Sk", "So",
		"Zs", "Zl", "Zp",
		"Cc", "Cf", "Cs", "Co",
	}
	for _, name := range names {
		// The ok check keeps a missing table from reaching unicode.Is as nil.
		if table, ok := unicode.Categories[name]; ok && unicode.Is(table, r) {
			return name
		}
	}
	return "Cn"
}

// main prints a handful of unicode-package classifications for sample runes
// covering Latin letters, a digit, a space, a CJK ideograph, and an emoji.
func main() {
	tests := []rune{'A', 'a', '5', ' ', '中', '😀'}

	for _, r := range tests {
		fmt.Printf("'%c' (U+%04X):\n", r, r)
		fmt.Printf("  IsLetter: %v\n", unicode.IsLetter(r))
		fmt.Printf("  IsDigit: %v\n", unicode.IsDigit(r))
		fmt.Printf("  IsSpace: %v\n", unicode.IsSpace(r))
		fmt.Printf("  IsUpper: %v\n", unicode.IsUpper(r))
		fmt.Printf("  IsLower: %v\n", unicode.IsLower(r))
		fmt.Printf("  Category: %s\n", generalCategory(r))
		fmt.Println()
	}
}

Case Conversion

Unicode Case Operations

package main

import (
	"fmt"
	"strings"
	"unicode"
)

// main demonstrates whole-string case conversion with the strings package and
// per-rune conversion with the unicode package.
func main() {
	text := "Hello World 世界"

	// String case conversion. The CJK characters have no case and pass
	// through unchanged.
	fmt.Printf("Original: %s\n", text)
	fmt.Printf("Upper: %s\n", strings.ToUpper(text))
	fmt.Printf("Lower: %s\n", strings.ToLower(text))
	// NOTE: strings.Title is deprecated since Go 1.18 because its word
	// boundary rule mishandles Unicode punctuation. It is kept here so the
	// example stays standard-library only; new code should use
	// golang.org/x/text/cases instead.
	fmt.Printf("Title: %s\n", strings.Title(text))

	// Rune case conversion
	fmt.Println("\nRune conversion:")
	for _, r := range "Café" {
		fmt.Printf("'%c' -> Upper: '%c', Lower: '%c'\n", r, unicode.ToUpper(r), unicode.ToLower(r))
	}
}

Normalization

Unicode Normalization

package main

import (
	"fmt"
	"unicode"
	"unicode/norm"
)

// main shows that visually identical strings can differ byte-for-byte, and
// how NFC/NFD normalization makes them comparable.
//
// NOTE: norm is not part of the standard library; it is provided by
// golang.org/x/text/unicode/norm, so the import path must point there.
func main() {
	// Composed form (é as single character, U+00E9)
	composed := "café"

	// Decomposed form (e + combining acute accent, U+0301)
	decomposed := "cafe\u0301"

	fmt.Printf("Composed: %q\n", composed)
	fmt.Printf("Decomposed: %q\n", decomposed)
	fmt.Printf("Are they equal? %v\n", composed == decomposed)

	// Point out the combining mark that makes the two forms differ.
	for _, r := range decomposed {
		if unicode.Is(unicode.Mn, r) {
			fmt.Printf("Combining mark in decomposed form: %U\n", r)
		}
	}

	// Normalize to NFC (composed)
	nfc := norm.NFC.String(decomposed)
	fmt.Printf("Normalized (NFC): %q\n", nfc)
	fmt.Printf("Equal after normalization? %v\n", composed == nfc)

	// Normalize to NFD (decomposed)
	nfd := norm.NFD.String(composed)
	fmt.Printf("Normalized (NFD): %q\n", nfd)
}

Practical Examples

Counting Characters Correctly

package main

import (
	"fmt"
	"unicode/utf8"
)

// countCharacters measures text in two units and returns both in a map:
// "bytes" holds the UTF-8 encoded length and "runes" holds the number of
// code points.
func countCharacters(text string) map[string]int {
	counts := make(map[string]int, 2)
	counts["bytes"] = len(text)
	counts["runes"] = utf8.RuneCountInString(text)
	return counts
}

// main prints byte and rune counts for ASCII, CJK, emoji, and accented text.
func main() {
	tests := []string{
		"Hello",
		"Hello 世界",
		"🎉🎊🎈",
		"Café",
	}

	for _, text := range tests {
		counts := countCharacters(text)
		fmt.Printf("'%s': %d bytes, %d characters\n", text, counts["bytes"], counts["runes"])
	}
}

Substring Operations

package main

import (
	"fmt"
)

// main compares slicing a string by byte offsets with slicing its []rune
// conversion by character positions.
func main() {
	text := "Hello 世界"

	// Byte indexing: text[0:5] happens to work because the first five
	// characters are ASCII, one byte each.
	fmt.Printf("text[0:5] = %q (byte slice; correct only because these are ASCII)\n", text[0:5])

	// Wrong: bytes 6 and 7 are only the first two of the three bytes that
	// encode the first CJK character, so the result is not valid UTF-8.
	fmt.Printf("text[6:8] = %q (wrong for Unicode: splits a character)\n", text[6:8])

	// Correct: rune indexing
	runes := []rune(text)
	fmt.Printf("runes[0:5] = %q\n", string(runes[0:5]))
	fmt.Printf("runes[6:8] = %q\n", string(runes[6:8]))
}

Text Truncation

package main

import (
	"fmt"
	"unicode/utf8"
)

// truncateString shortens text to at most maxRunes characters (code points)
// and appends "..." when anything was cut off. Text that already fits is
// returned unchanged. A negative maxRunes is treated as zero instead of
// causing a slice-bounds panic.
func truncateString(text string, maxRunes int) string {
	if maxRunes < 0 {
		maxRunes = 0
	}

	// Count first: the common "already fits" case then needs no allocation.
	if utf8.RuneCountInString(text) <= maxRunes {
		return text
	}

	// Cut on rune boundaries so a multi-byte character is never split.
	return string([]rune(text)[:maxRunes]) + "..."
}

// main truncates ASCII, CJK, and emoji samples to five characters each.
func main() {
	tests := []string{
		"Hello World",
		"Hello 世界",
		"🎉🎊🎈🎁",
	}

	for _, text := range tests {
		truncated := truncateString(text, 5)
		fmt.Printf("'%s' -> '%s'\n", text, truncated)
	}
}

Grapheme Clustering

package main

import (
	"fmt"
	"unicode"
)

// isGraphemeCluster reports whether r is a Unicode mark (category M), i.e. a
// rune that attaches to the preceding character. This is a simplification,
// not full grapheme cluster detection.
func isGraphemeCluster(r rune) bool {
	return unicode.IsMark(r)
}

// splitGraphemes breaks text into approximate grapheme clusters: every rune
// for which isGraphemeCluster is false starts a new cluster, and every rune
// for which it is true is attached to the cluster in progress. Empty input
// yields a nil slice.
func splitGraphemes(text string) []string {
	var (
		clusters []string
		pending  []rune // runes of the cluster currently being assembled
	)

	for _, r := range text {
		// A base rune closes whatever cluster was open before it.
		if !isGraphemeCluster(r) && len(pending) > 0 {
			clusters = append(clusters, string(pending))
			pending = pending[:0]
		}
		pending = append(pending, r)
	}

	// Flush the final cluster, if any.
	if len(pending) > 0 {
		clusters = append(clusters, string(pending))
	}

	return clusters
}

// main splits a decomposed "é" and prints the resulting clusters.
func main() {
	text := "e\u0301" // é as e + combining acute
	graphemes := splitGraphemes(text)

	fmt.Printf("Text: %q\n", text)
	// %q quotes each element, so cluster boundaries and combining marks stay
	// visible instead of being merged by the terminal's rendering.
	fmt.Printf("Graphemes: %q\n", graphemes)
}

Best Practices

✅ Good Practices

// Use runes for character operations.
// Converting to []rune makes i a character position rather than a byte
// offset, at the cost of copying the string.
runes := []rune(text)
for i, r := range runes {
	// Process character
}

// Use utf8.RuneCountInString for character count
// (len(text) would give the number of bytes instead).
count := utf8.RuneCountInString(text)

// Validate UTF-8, typically on input that comes from outside the program.
if !utf8.ValidString(text) {
	// Handle invalid UTF-8
}

// Use unicode package for properties instead of hand-written ASCII range
// checks.
if unicode.IsLetter(r) {
	// Process letter
}

// Normalize when comparing, so composed and decomposed forms of the same
// text end up byte-identical.
// NOTE(review): norm is presumably golang.org/x/text/unicode/norm — confirm
// the import.
normalized := norm.NFC.String(text)

โŒ Anti-Patterns

// Don't use len() for character count: len on a string returns the number of
// bytes, which exceeds the character count as soon as the text is not ASCII.
count := len(text) // Wrong for Unicode!

// Don't index strings directly: text[5] is a single byte, not a character.
char := text[5] // May split multi-byte character

// Don't assume ASCII
// Always consider Unicode

// Don't ignore encoding
// Always validate UTF-8

Resources

Summary

Unicode and string encoding in Go:

  • Understand the difference between bytes and runes
  • Use runes for character operations
  • Use utf8.RuneCountInString for character count
  • Validate UTF-8 input
  • Use unicode package for character properties
  • Normalize text when comparing
  • Handle internationalization properly

With these techniques, you can correctly handle Unicode text in Go applications.

Comments