Unicode and String Encoding in Go
Unicode is fundamental to modern text processing. Go has excellent Unicode support built-in. This guide covers Unicode concepts and practical string encoding techniques.
Unicode Fundamentals
Understanding Runes and Bytes
// Demonstrates the difference between byte length (len) and
// character count (utf8.RuneCountInString) for ASCII, CJK, and emoji.
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// ASCII: one byte per character, so both counts match.
	ascii := "Hello"
	fmt.Printf("ASCII: %q\n", ascii)
	fmt.Printf("Length (bytes): %d\n", len(ascii))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(ascii))

	// CJK characters occupy 3 bytes each in UTF-8, so byte length
	// exceeds the rune count. (Renamed from `unicode` to avoid
	// shadowing the standard-library package name.)
	unicodeStr := "Hello 世界"
	fmt.Printf("\nUnicode: %q\n", unicodeStr)
	fmt.Printf("Length (bytes): %d\n", len(unicodeStr))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(unicodeStr))

	// Emoji are encoded with 4 bytes each in UTF-8.
	emoji := "Hello 🌍"
	fmt.Printf("\nEmoji: %q\n", emoji)
	fmt.Printf("Length (bytes): %d\n", len(emoji))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(emoji))
}
Iterating Over Runes
// Demonstrates correct rune-based iteration versus byte-based
// indexing over a string containing multi-byte characters.
package main

import (
	"fmt"
)

func main() {
	text := "Hello 世界"

	// range over a string decodes one rune per iteration; the index
	// is the rune's byte offset, so it jumps by 3 across CJK chars.
	fmt.Println("By rune:")
	for i, r := range text {
		fmt.Printf("Index %d: %c (U+%04X)\n", i, r, r)
	}

	// Indexing by byte yields raw UTF-8 bytes and splits multi-byte
	// characters — wrong for Unicode text.
	fmt.Println("\nBy byte:")
	for i := 0; i < len(text); i++ {
		fmt.Printf("Index %d: %d\n", i, text[i])
	}
}
UTF-8 Encoding
UTF-8 Basics
// Shows how many bytes UTF-8 uses for characters in different ranges.
// (The original imported unicode/utf8 without using it, which is a
// compile error in Go; the import is removed.)
package main

import (
	"fmt"
)

func main() {
	// ASCII: 1 byte.
	text := "A"
	fmt.Printf("'%s' = %v (1 byte)\n", text, []byte(text))

	// Latin-1 supplement (U+0080–U+07FF range): 2 bytes.
	text = "é"
	fmt.Printf("'%s' = %v (2 bytes)\n", text, []byte(text))

	// CJK (U+0800–U+FFFF range): 3 bytes.
	text = "中"
	fmt.Printf("'%s' = %v (3 bytes)\n", text, []byte(text))

	// Supplementary planes (emoji): 4 bytes.
	text = "😀"
	fmt.Printf("'%s' = %v (4 bytes)\n", text, []byte(text))
}
Encoding and Decoding Runes
// Demonstrates encoding a rune to UTF-8 bytes and decoding the first
// and last runes of a byte slice.
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// Encode a rune into its UTF-8 byte sequence. utf8.UTFMax (4) is
	// the largest encoding any rune can need.
	// (The original literal was mojibake spanning two characters,
	// which is an invalid rune literal.)
	r := '世'
	buf := make([]byte, utf8.UTFMax)
	n := utf8.EncodeRune(buf, r)
	fmt.Printf("Encoded '%c': %v (size: %d)\n", r, buf[:n], n)

	// Decode the first rune from a UTF-8 byte slice. (Renamed from
	// `bytes` to avoid shadowing the standard-library package name.)
	encoded := []byte("世")
	r, size := utf8.DecodeRune(encoded)
	fmt.Printf("Decoded: '%c' (size: %d)\n", r, size)

	// Decode the final rune in the slice.
	r, size = utf8.DecodeLastRune(encoded)
	fmt.Printf("Last rune: '%c' (size: %d)\n", r, size)
}
String Validation
Validating UTF-8
// Demonstrates validating UTF-8 in strings and byte slices.
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// Go source is UTF-8, so string literals are always valid.
	validStr := "Hello 世界"
	fmt.Printf("'%s' is valid UTF-8: %v\n", validStr, utf8.ValidString(validStr))

	validBytes := []byte("Hello")
	fmt.Printf("Bytes are valid UTF-8: %v\n", utf8.Valid(validBytes))

	// 0xFF and 0xFE can never appear in well-formed UTF-8 (this pair
	// is the UTF-16 byte-order mark), so Valid reports false.
	invalidBytes := []byte{0xFF, 0xFE}
	fmt.Printf("Invalid bytes are valid UTF-8: %v\n", utf8.Valid(invalidBytes))
}
Character Classification
Unicode Properties
// Demonstrates Unicode character classification predicates and
// general-category lookup.
package main

import (
	"fmt"
	"unicode"
)

// categoryOf returns the name of the two-letter Unicode general
// category (e.g. "Lu", "Nd", "Zs") that r belongs to, or "" if none
// matches. (The original called unicode.Category, which does not
// exist; the unicode package exposes a Categories map instead.)
func categoryOf(r rune) string {
	for name, table := range unicode.Categories {
		// Skip the one-letter super-categories ("L", "N", ...) so the
		// specific category is reported. A rune belongs to exactly one
		// two-letter category, so map iteration order doesn't matter.
		if len(name) == 2 && unicode.Is(table, r) {
			return name
		}
	}
	return ""
}

func main() {
	tests := []rune{'A', 'a', '5', ' ', '中', '🎉'}
	for _, r := range tests {
		fmt.Printf("'%c' (U+%04X):\n", r, r)
		fmt.Printf(" IsLetter: %v\n", unicode.IsLetter(r))
		fmt.Printf(" IsDigit: %v\n", unicode.IsDigit(r))
		fmt.Printf(" IsSpace: %v\n", unicode.IsSpace(r))
		fmt.Printf(" IsUpper: %v\n", unicode.IsUpper(r))
		fmt.Printf(" IsLower: %v\n", unicode.IsLower(r))
		fmt.Printf(" Category: %s\n", categoryOf(r))
		fmt.Println()
	}
}
Case Conversion
Unicode Case Operations
// Demonstrates string-level and per-rune case conversion.
package main

import (
	"fmt"
	"strings"
	"unicode"
)

func main() {
	text := "Hello World 世界"

	fmt.Printf("Original: %s\n", text)
	fmt.Printf("Upper: %s\n", strings.ToUpper(text))
	fmt.Printf("Lower: %s\n", strings.ToLower(text))
	// NOTE: strings.Title is deprecated (it can't handle Unicode
	// word boundaries correctly); golang.org/x/text/cases is the
	// modern replacement. Kept here to illustrate the legacy API.
	fmt.Printf("Title: %s\n", strings.Title(text))

	// Per-rune conversion handles accented letters like é correctly.
	fmt.Println("\nRune conversion:")
	for _, r := range "Café" {
		fmt.Printf("'%c' -> Upper: '%c', Lower: '%c'\n", r, unicode.ToUpper(r), unicode.ToLower(r))
	}
}
Normalization
Unicode Normalization
package main
import (
"fmt"
"unicode"
"unicode/norm"
)
func main() {
// Composed form (รฉ as single character)
composed := "cafรฉ"
// Decomposed form (e + accent)
decomposed := "cafe\u0301"
fmt.Printf("Composed: %q\n", composed)
fmt.Printf("Decomposed: %q\n", decomposed)
fmt.Printf("Are they equal? %v\n", composed == decomposed)
// Normalize to NFC (composed)
nfc := norm.NFC.String(decomposed)
fmt.Printf("Normalized (NFC): %q\n", nfc)
fmt.Printf("Equal after normalization? %v\n", composed == nfc)
// Normalize to NFD (decomposed)
nfd := norm.NFD.String(composed)
fmt.Printf("Normalized (NFD): %q\n", nfd)
}
Practical Examples
Counting Characters Correctly
package main

import (
	"fmt"
	"unicode/utf8"
)

// countCharacters returns the byte length and rune count of text.
// The two differ whenever text contains multi-byte UTF-8 sequences.
func countCharacters(text string) map[string]int {
	return map[string]int{
		"bytes": len(text),
		"runes": utf8.RuneCountInString(text),
	}
}

func main() {
	tests := []string{
		"Hello",
		"Hello 世界",
		"🎉🎉🎉",
		"Café",
	}
	for _, text := range tests {
		counts := countCharacters(text)
		fmt.Printf("'%s': %d bytes, %d characters\n", text, counts["bytes"], counts["runes"])
	}
}
Substring Operations
// Shows why substrings must be taken on rune boundaries.
package main

import (
	"fmt"
)

func main() {
	text := "Hello 世界"

	// Byte slicing happens to work here only because the first five
	// bytes are ASCII; it would split a multi-byte character otherwise.
	fmt.Printf("text[0:5] = %q (wrong for Unicode)\n", text[0:5])

	// Converting to []rune gives character-level indexing; index 6-8
	// selects the two CJK characters after "Hello ".
	runes := []rune(text)
	fmt.Printf("runes[0:5] = %q\n", string(runes[0:5]))
	fmt.Printf("runes[6:8] = %q\n", string(runes[6:8]))
}
Text Truncation
// (The original imported unicode/utf8 without using it, which is a
// compile error in Go; the import is removed.)
package main

import (
	"fmt"
)

// truncateString shortens text to at most maxRunes characters,
// appending "..." when truncation occurred. Counting runes rather
// than bytes guarantees multi-byte characters are never split.
// A negative maxRunes is treated as 0.
func truncateString(text string, maxRunes int) string {
	if maxRunes < 0 {
		// Guard: runes[:maxRunes] would panic on a negative bound.
		maxRunes = 0
	}
	runes := []rune(text)
	if len(runes) <= maxRunes {
		return text
	}
	return string(runes[:maxRunes]) + "..."
}

func main() {
	tests := []string{
		"Hello World",
		"Hello 世界",
		"🎉🎉🎉🎉",
	}
	for _, text := range tests {
		truncated := truncateString(text, 5)
		fmt.Printf("'%s' -> '%s'\n", text, truncated)
	}
}
Grapheme Clustering
// Demonstrates a simplified form of grapheme clustering: grouping
// combining marks with the base character they modify.
package main

import (
	"fmt"
	"unicode"
)

// isGraphemeCluster reports whether r is a combining mark (Unicode
// category M) that should attach to the preceding base character.
// This is a simplified heuristic, not full UAX #29 segmentation.
func isGraphemeCluster(r rune) bool {
	return unicode.Is(unicode.Mark, r)
}

// splitGraphemes groups each base character with any combining marks
// that follow it and returns the resulting clusters in order.
func splitGraphemes(text string) []string {
	var clusters []string
	pending := ""
	for _, r := range text {
		switch {
		case isGraphemeCluster(r):
			// Combining mark: extend the cluster under construction.
			pending += string(r)
		default:
			// New base character: flush the previous cluster first.
			if pending != "" {
				clusters = append(clusters, pending)
			}
			pending = string(r)
		}
	}
	if pending != "" {
		clusters = append(clusters, pending)
	}
	return clusters
}

func main() {
	text := "e\u0301" // é as e + combining acute
	graphemes := splitGraphemes(text)
	fmt.Printf("Text: %q\n", text)
	fmt.Printf("Graphemes: %v\n", graphemes)
}
Best Practices
✅ Good Practices
// Illustrative fragment, not a complete program — assumes text, r,
// and the utf8/unicode/norm packages are already in scope.
// Use runes for character operations
runes := []rune(text)
for i, r := range runes {
// Process character
}
// Use utf8.RuneCountInString for character count
count := utf8.RuneCountInString(text)
// Validate UTF-8
if !utf8.ValidString(text) {
// Handle invalid UTF-8
}
// Use unicode package for properties
if unicode.IsLetter(r) {
// Process letter
}
// Normalize when comparing
// (norm is golang.org/x/text/unicode/norm, not the standard library)
normalized := norm.NFC.String(text)
❌ Anti-Patterns
// Illustrative fragment of what NOT to do — assumes text is in scope.
// Don't use len() for character count
count := len(text) // Wrong for Unicode!
// Don't index strings directly
char := text[5] // May split multi-byte character
// Don't assume ASCII
// Always consider Unicode
// Don't ignore encoding
// Always validate UTF-8
Resources
- Go unicode Package Documentation
- Go unicode/utf8 Package Documentation
- golang.org/x/text/unicode/norm Package Documentation (not part of the standard library)
- UTF-8 Specification
- Unicode Standard
Summary
Unicode and string encoding in Go:
- Understand the difference between bytes and runes
- Use runes for character operations
- Use utf8.RuneCountInString for character count
- Validate UTF-8 input
- Use unicode package for character properties
- Normalize text when comparing
- Handle internationalization properly
With these techniques, you can correctly handle Unicode text in Go applications.
Comments