Unicode is fundamental to modern text processing. Go has excellent Unicode support built-in. This guide covers Unicode concepts and practical string encoding techniques. For more context, see Go Installation Guide, Go Ecosystem Overview, Go Best Practices.
Unicode Fundamentals
Understanding Runes and Bytes
package main

import (
	"fmt"
	"unicode/utf8"
)

// report prints a labeled string together with its byte length and its
// rune (character) count, highlighting that the two differ for non-ASCII.
func report(label, s string) {
	fmt.Printf("%s: %q\n", label, s)
	fmt.Printf("Length (bytes): %d\n", len(s))
	fmt.Printf("Length (runes): %d\n", utf8.RuneCountInString(s))
}

func main() {
	// Plain ASCII: byte length and rune count agree.
	report("ASCII", "Hello")
	// CJK characters occupy three bytes each in UTF-8.
	report("\nUnicode", "Hello 世界")
	// Emoji live outside the BMP and take four bytes each.
	report("\nEmoji", "Hello 👋")
}
Iterating Over Runes
package main

import (
	"fmt"
)

func main() {
	const sample = "Hello 世界"

	// range decodes one rune per iteration; the index advances by the
	// rune's encoded width, so it is a byte offset, not a rune count.
	fmt.Println("By rune:")
	for offset, ch := range sample {
		fmt.Printf("Index %d: %c (U+%04X)\n", offset, ch, ch)
	}

	// Byte-wise indexing walks the raw UTF-8 encoding and splits
	// multi-byte characters — usually wrong for text processing.
	fmt.Println("\nBy byte:")
	for b := 0; b < len(sample); b++ {
		fmt.Printf("Index %d: %d\n", b, sample[b])
	}
}
UTF-8 Encoding
UTF-8 Basics
package main

import (
	"fmt"
)

// main demonstrates how many bytes UTF-8 needs for code points of
// increasing magnitude: one byte for ASCII up to four for emoji.
// (Fix: the original imported "unicode/utf8" without using it, which
// is a compile error in Go.)
func main() {
	// 1 byte: ASCII (U+0000..U+007F).
	text := "A"
	fmt.Printf("'%s' = %v (1 byte)\n", text, []byte(text))
	// 2 bytes: U+0080..U+07FF (Latin supplements, Greek, Cyrillic, ...).
	text = "é"
	fmt.Printf("'%s' = %v (2 bytes)\n", text, []byte(text))
	// 3 bytes: U+0800..U+FFFF, which covers most of the BMP including CJK.
	text = "中"
	fmt.Printf("'%s' = %v (3 bytes)\n", text, []byte(text))
	// 4 bytes: supplementary planes (U+10000..U+10FFFF), e.g. emoji.
	text = "😀"
	fmt.Printf("'%s' = %v (4 bytes)\n", text, []byte(text))
}
Encoding and Decoding Runes
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// Encode a rune into a caller-supplied buffer. EncodeRune reports how
	// many bytes it wrote; utf8.UTFMax (4) is always a big enough buffer.
	ch := '世'
	scratch := make([]byte, utf8.UTFMax)
	written := utf8.EncodeRune(scratch, ch)
	fmt.Printf("Encoded '%c': %v (size: %d)\n", ch, scratch[:written], written)

	// Decode the first rune from a UTF-8 byte slice.
	encoded := []byte("世")
	first, firstLen := utf8.DecodeRune(encoded)
	fmt.Printf("Decoded: '%c' (size: %d)\n", first, firstLen)

	// Decode the final rune (here the slice holds only one rune).
	last, lastLen := utf8.DecodeLastRune(encoded)
	fmt.Printf("Last rune: '%c' (size: %d)\n", last, lastLen)
}
String Validation
Validating UTF-8
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// A Go string literal in source is always well-formed UTF-8.
	ok := "Hello 世界"
	fmt.Printf("'%s' is valid UTF-8: %v\n", ok, utf8.ValidString(ok))

	// utf8.Valid is the []byte counterpart of ValidString.
	fmt.Printf("Bytes are valid UTF-8: %v\n", utf8.Valid([]byte("Hello")))

	// 0xFF and 0xFE can never occur in UTF-8, so validation fails.
	bogus := []byte{0xFF, 0xFE}
	fmt.Printf("Invalid bytes are valid UTF-8: %v\n", utf8.Valid(bogus))
}
Character Classification
Unicode Properties
package main

import (
	"fmt"
	"unicode"
)

// categoryOf returns the two-letter Unicode general category (e.g. "Lu",
// "Nd", "Zs") containing r, or "unknown" if none matches. The standard
// library exposes the category tables via the unicode.Categories map but
// has no direct rune-to-category lookup (the original code called a
// non-existent unicode.Category function, which does not compile), so we
// scan the map. One-letter keys ("L", "N", ...) are supersets and are
// skipped, leaving exactly one match per rune.
func categoryOf(r rune) string {
	for name, table := range unicode.Categories {
		if len(name) == 2 && unicode.Is(table, r) {
			return name
		}
	}
	return "unknown"
}

func main() {
	tests := []rune{'A', 'a', '5', ' ', '中', '😀'}
	for _, r := range tests {
		fmt.Printf("'%c' (U+%04X):\n", r, r)
		fmt.Printf("  IsLetter: %v\n", unicode.IsLetter(r))
		fmt.Printf("  IsDigit: %v\n", unicode.IsDigit(r))
		fmt.Printf("  IsSpace: %v\n", unicode.IsSpace(r))
		fmt.Printf("  IsUpper: %v\n", unicode.IsUpper(r))
		fmt.Printf("  IsLower: %v\n", unicode.IsLower(r))
		fmt.Printf("  Category: %s\n", categoryOf(r))
		fmt.Println()
	}
}
Case Conversion
Unicode Case Operations
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// titleCase upper-cases the first letter of every word, where a word is a
// maximal run of letters. It replaces the deprecated strings.Title; the
// officially recommended replacement (golang.org/x/text/cases) lives
// outside the standard library, so this keeps the example dependency-free.
func titleCase(s string) string {
	inWord := false
	return strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) {
			if !inWord {
				inWord = true
				return unicode.ToTitle(r)
			}
			return r
		}
		inWord = false
		return r
	}, s)
}

func main() {
	text := "Hello World 世界"

	// Whole-string case conversion.
	fmt.Printf("Original: %s\n", text)
	fmt.Printf("Upper: %s\n", strings.ToUpper(text))
	fmt.Printf("Lower: %s\n", strings.ToLower(text))
	fmt.Printf("Title: %s\n", titleCase(text))

	// Per-rune case conversion; runes without a case mapping (like é's
	// base forms already cased) come back unchanged where appropriate.
	fmt.Println("\nRune conversion:")
	for _, r := range "Café" {
		fmt.Printf("'%c' -> Upper: '%c', Lower: '%c'\n", r, unicode.ToUpper(r), unicode.ToLower(r))
	}
}
Normalization
Unicode Normalization (note: the norm package is not part of the standard library — it lives at golang.org/x/text/unicode/norm; install it with `go get golang.org/x/text`)
package main
import (
"fmt"
"unicode"
"unicode/norm"
)
func main() {
// Composed form (é as single character)
composed := "café"
// Decomposed form (e + accent)
decomposed := "cafe\u0301"
fmt.Printf("Composed: %q\n", composed)
fmt.Printf("Decomposed: %q\n", decomposed)
fmt.Printf("Are they equal? %v\n", composed == decomposed)
// Normalize to NFC (composed)
nfc := norm.NFC.String(decomposed)
fmt.Printf("Normalized (NFC): %q\n", nfc)
fmt.Printf("Equal after normalization? %v\n", composed == nfc)
// Normalize to NFD (decomposed)
nfd := norm.NFD.String(composed)
fmt.Printf("Normalized (NFD): %q\n", nfd)
}
Practical Examples
Counting Characters Correctly
package main

import (
	"fmt"
	"unicode/utf8"
)

// countCharacters reports the byte length and the rune (character) count
// of text, keyed as "bytes" and "runes" respectively.
func countCharacters(text string) map[string]int {
	return map[string]int{
		"bytes": len(text),
		"runes": utf8.RuneCountInString(text),
	}
}

func main() {
	samples := []string{"Hello", "Hello 世界", "🎉🎊🎈", "Café"}
	for _, sample := range samples {
		stats := countCharacters(sample)
		fmt.Printf("'%s': %d bytes, %d characters\n", sample, stats["bytes"], stats["runes"])
	}
}
Substring Operations
package main

import (
	"fmt"
)

func main() {
	const phrase = "Hello 世界"

	// Slicing a string indexes bytes, so it can cut a multi-byte
	// character in half; it only works here because the prefix is ASCII.
	fmt.Printf("text[0:5] = %q (wrong for Unicode)\n", phrase[0:5])

	// Converting to []rune first makes indices character positions.
	chars := []rune(phrase)
	fmt.Printf("runes[0:5] = %q\n", string(chars[0:5]))
	fmt.Printf("runes[6:8] = %q\n", string(chars[6:8]))
}
Text Truncation
package main

import (
	"fmt"
)

// truncateString shortens text to at most maxRunes characters, appending
// "..." when anything was cut. Counting runes rather than bytes keeps
// multi-byte characters (CJK, emoji) intact.
// (Fix: the original imported "unicode/utf8" without using it, which is
// a compile error in Go.)
func truncateString(text string, maxRunes int) string {
	runes := []rune(text)
	if len(runes) <= maxRunes {
		return text
	}
	return string(runes[:maxRunes]) + "..."
}

func main() {
	tests := []string{
		"Hello World",
		"Hello 世界",
		"🎉🎊🎈🎁",
	}
	for _, text := range tests {
		truncated := truncateString(text, 5)
		fmt.Printf("'%s' -> '%s'\n", text, truncated)
	}
}
Grapheme Clustering
package main

import (
	"fmt"
	"unicode"
)

// isCombiningMark reports whether r is a Unicode mark (category M),
// i.e. a code point that attaches to the preceding base character.
func isCombiningMark(r rune) bool {
	return unicode.Is(unicode.Mark, r)
}

// splitGraphemes groups text into rough grapheme clusters: each base rune
// plus any combining marks that follow it. (Full grapheme segmentation
// per UAX #29 handles more cases, e.g. emoji ZWJ sequences.)
func splitGraphemes(text string) []string {
	var clusters []string
	pending := ""
	for _, r := range text {
		if isCombiningMark(r) {
			// Marks extend the cluster under construction.
			pending += string(r)
			continue
		}
		// A base rune starts a new cluster; flush the previous one.
		if pending != "" {
			clusters = append(clusters, pending)
		}
		pending = string(r)
	}
	if pending != "" {
		clusters = append(clusters, pending)
	}
	return clusters
}

func main() {
	text := "e\u0301" // é as e + combining acute
	graphemes := splitGraphemes(text)
	fmt.Printf("Text: %q\n", text)
	fmt.Printf("Graphemes: %v\n", graphemes)
}
Best Practices
✅ Good Practices
// Use runes for character operations
runes := []rune(text)
for i, r := range runes {
// Process character
}
// Use utf8.RuneCountInString for character count
count := utf8.RuneCountInString(text)
// Validate UTF-8
if !utf8.ValidString(text) {
// Handle invalid UTF-8
}
// Use unicode package for properties
if unicode.IsLetter(r) {
// Process letter
}
// Normalize when comparing
normalized := norm.NFC.String(text)
❌ Anti-Patterns
// Don't use len() for character count
count := len(text) // Wrong for Unicode!
// Don't index strings directly
char := text[5] // May split multi-byte character
// Don't assume ASCII
// Always consider Unicode
// Don't ignore encoding
// Always validate UTF-8
Resources
- Go unicode Package Documentation
- Go unicode/utf8 Package Documentation
- golang.org/x/text/unicode/norm Package Documentation
- UTF-8 Specification
- Unicode Standard
Summary
Unicode and string encoding in Go:
- Understand the difference between bytes and runes
- Use runes for character operations
- Use utf8.RuneCountInString for character count
- Validate UTF-8 input
- Use unicode package for character properties
- Normalize text when comparing
- Handle internationalization properly
With these techniques, you can correctly handle Unicode text in Go applications.
Comments