Understanding how Go handles bytes, runes, and Unicode is essential for working with text and strings. Go has excellent Unicode support built in, but it’s important to understand the distinctions between bytes, runes, and strings. This guide covers everything you need to know. For more context, see Go Installation Guide, Go Ecosystem Overview, Go Best Practices.
Understanding Bytes and Runes
Bytes vs Runes
package main
import (
"fmt"
)
// main contrasts the byte view and the rune view of two strings: one that
// is pure ASCII and one that also contains multi-byte characters.
func main() {
	// ASCII only: byte count and character count coincide.
	ascii := "Hello"
	asciiBytes := []byte(ascii)
	fmt.Println("String:", ascii)
	fmt.Println("Length:", len(ascii))
	fmt.Println("Bytes:", asciiBytes)

	// Mixed ASCII and CJK: len() reports bytes, the rune slice reports
	// code points, and the two numbers differ.
	mixed := "Hello 世界"
	mixedBytes, mixedRunes := []byte(mixed), []rune(mixed)
	fmt.Println("\nString:", mixed)
	fmt.Println("Length (bytes):", len(mixed))
	fmt.Println("Length (runes):", len(mixedRunes))
	fmt.Println("Bytes:", mixedBytes)
	fmt.Println("Runes:", mixedRunes)
}
Iterating Over Strings
package main
import (
"fmt"
)
// main walks the same string three ways: byte by byte, with range over the
// string (which decodes runes), and over a pre-converted rune slice.
func main() {
	text := "Hello 世界"

	// Byte-wise walk: each element is a single byte, so multi-byte
	// characters are printed as unrelated one-byte values.
	fmt.Println("By byte:")
	for pos, b := range []byte(text) {
		fmt.Printf("Index %d: %c (byte: %d)\n", pos, b, b)
	}

	// range over a string decodes one rune per step; the index is the
	// byte offset at which that rune starts.
	fmt.Println("\nBy rune:")
	for offset, char := range text {
		fmt.Printf("Index %d: %c (rune: %U)\n", offset, char, char)
	}

	// A rune slice gives character positions as indices.
	fmt.Println("\nUsing rune slice:")
	chars := []rune(text)
	for pos := range chars {
		fmt.Printf("Index %d: %c (rune: %U)\n", pos, chars[pos], chars[pos])
	}
}
Working with Bytes
Byte Slices
package main
import (
"fmt"
)
// main shows the basic byte-slice operations: converting from a string,
// mutating in place, building from numeric values, and byte literals.
func main() {
	// The conversion copies the string's bytes, so the slice is mutable.
	greeting := "Hello"
	raw := []byte(greeting)
	fmt.Println("Bytes:", raw)

	// Overwrite the first byte in place.
	raw[0] = 'J'
	fmt.Println("Modified:", string(raw))

	// The same five bytes spelled as character literals.
	literal := []byte{'H', 'e', 'l', 'l', 'o'}
	fmt.Println("From bytes:", string(literal))

	// A character literal converted to a single byte.
	letter := byte('A')
	fmt.Printf("Byte: %c (value: %d)\n", letter, letter)
}
Byte Operations
package main
import (
"bytes"
"fmt"
)
// main exercises the bytes package helpers that mirror the strings package:
// Equal, Contains, Index, Count and Replace.
func main() {
	first, second, other := []byte("hello"), []byte("hello"), []byte("world")

	// Content comparison.
	fmt.Println("b1 == b2:", bytes.Equal(first, second))
	fmt.Println("b1 == b3:", bytes.Equal(first, other))

	// Sub-slice search.
	fmt.Println("Contains 'ell':", bytes.Contains(first, []byte("ell")))

	// The single-letter needle is shared by the next three calls.
	ell := []byte("l")
	fmt.Println("Index of 'l':", bytes.Index(first, ell))
	fmt.Println("Count 'l':", bytes.Count(first, ell))

	// A negative count replaces every occurrence.
	replaced := bytes.Replace(first, ell, []byte("L"), -1)
	fmt.Println("Replace:", string(replaced))
}
Working with Runes
Rune Basics
package main
import (
"fmt"
"unicode"
)
// main introduces the rune type: literals, ranging over a string to obtain
// runes, and the unicode package's classification predicates.
func main() {
	// rune is an alias for int32, so a rune prints as a character (%c),
	// a number (%d) or a code point (%U).
	latin := rune('A')
	fmt.Printf("Rune: %c (value: %d, Unicode: %U)\n", latin, latin, latin)

	// An untyped character literal defaults to rune.
	cjk := '世'
	fmt.Printf("Rune: %c (value: %d, Unicode: %U)\n", cjk, cjk, cjk)

	// Ranging over a string yields its runes.
	for _, ch := range "Hello" {
		fmt.Printf("%c: %U\n", ch, ch)
	}

	// Classification predicates, evaluated up front and printed in order.
	fmt.Println("\nRune properties:")
	checks := []struct {
		label string
		ok    bool
	}{
		{"IsLetter('A'):", unicode.IsLetter('A')},
		{"IsDigit('5'):", unicode.IsDigit('5')},
		{"IsSpace(' '):", unicode.IsSpace(' ')},
		{"IsUpper('A'):", unicode.IsUpper('A')},
		{"IsLower('a'):", unicode.IsLower('a')},
	}
	for _, c := range checks {
		fmt.Println(c.label, c.ok)
	}
}
Rune Conversion
package main
import (
"fmt"
"unicode"
)
// main demonstrates rune case conversion and how to look up a rune's
// Unicode general category.
func main() {
	// Convert case
	r := 'a'
	fmt.Printf("'%c' to upper: '%c'\n", r, unicode.ToUpper(r))
	r = 'A'
	fmt.Printf("'%c' to lower: '%c'\n", r, unicode.ToLower(r))

	// Convert to title case
	r = 'a'
	fmt.Printf("'%c' to title: '%c'\n", r, unicode.ToTitle(r))

	// Get rune category. The unicode package has no Category function;
	// membership is tested against its exported range tables instead.
	fmt.Println("\nRune categories:")
	fmt.Println("Category of 'A':", runeCategory('A'))
	fmt.Println("Category of '5':", runeCategory('5'))
	fmt.Println("Category of ' ':", runeCategory(' '))
}

// runeCategory returns the two-letter Unicode general category of r
// (for example "Lu", "Nd" or "Zs"). It returns "Cn" when r is in none of
// the tables listed below.
func runeCategory(r rune) string {
	// An ordered slice rather than the unicode.Categories map keeps the
	// lookup deterministic: map iteration order is random and the map also
	// holds umbrella categories such as "L" that overlap the specific ones.
	tables := []struct {
		name  string
		table *unicode.RangeTable
	}{
		{"Lu", unicode.Lu}, {"Ll", unicode.Ll}, {"Lt", unicode.Lt},
		{"Lm", unicode.Lm}, {"Lo", unicode.Lo},
		{"Mn", unicode.Mn}, {"Mc", unicode.Mc}, {"Me", unicode.Me},
		{"Nd", unicode.Nd}, {"Nl", unicode.Nl}, {"No", unicode.No},
		{"Pc", unicode.Pc}, {"Pd", unicode.Pd}, {"Ps", unicode.Ps},
		{"Pe", unicode.Pe}, {"Pi", unicode.Pi}, {"Pf", unicode.Pf},
		{"Po", unicode.Po},
		{"Sm", unicode.Sm}, {"Sc", unicode.Sc}, {"Sk", unicode.Sk},
		{"So", unicode.So},
		{"Zs", unicode.Zs}, {"Zl", unicode.Zl}, {"Zp", unicode.Zp},
		{"Cc", unicode.Cc}, {"Cf", unicode.Cf}, {"Cs", unicode.Cs},
		{"Co", unicode.Co},
	}
	for _, t := range tables {
		if unicode.Is(t.table, r) {
			return t.name
		}
	}
	return "Cn"
}
Unicode and UTF-8
UTF-8 Encoding
package main
import (
"fmt"
"unicode/utf8"
)
// main shows the unicode/utf8 primitives: counting runes, encoding a rune
// into bytes, and decoding the first and last rune of a byte slice.
func main() {
	text := "Hello 世界"
	fmt.Println("String:", text)
	fmt.Println("Byte length:", len(text))
	fmt.Println("Rune length:", utf8.RuneCountInString(text))

	// Encode into a fixed scratch array sized for the longest possible
	// encoding; EncodeRune reports how many bytes it actually wrote.
	char := '世'
	var scratch [utf8.UTFMax]byte
	written := utf8.EncodeRune(scratch[:], char)
	fmt.Printf("Encoded '%c': %v (length: %d)\n", char, scratch[:written], written)

	// Decode the leading rune of an encoded sequence.
	encoded := []byte("世")
	first, firstLen := utf8.DecodeRune(encoded)
	fmt.Printf("Decoded: '%c' (size: %d)\n", first, firstLen)

	// Decode the trailing rune of the same sequence.
	last, lastLen := utf8.DecodeLastRune(encoded)
	fmt.Printf("Last rune: '%c' (size: %d)\n", last, lastLen)
}
Validating UTF-8
package main
import (
"fmt"
"unicode/utf8"
)
// main checks a well-formed string and a malformed byte sequence with the
// unicode/utf8 validators.
func main() {
	wellFormed := "Hello 世界"
	fmt.Println("Valid UTF-8:", utf8.ValidString(wellFormed))

	// 0xFF and 0xFE never occur in valid UTF-8.
	malformed := []byte{0xFF, 0xFE}
	fmt.Println("Valid bytes:", utf8.Valid(malformed))

	// Guard clause: only report when the string validates.
	if !utf8.ValidString(wellFormed) {
		return
	}
	fmt.Println("String is valid UTF-8")
}
Text Processing
String Manipulation with Runes
package main
import (
"fmt"
"strings"
"unicode"
)
// main demonstrates rune-aware string manipulation: reversing, upper- and
// lower-casing, and title-casing a string that mixes ASCII and CJK text.
func main() {
	str := "Hello 世界"

	// Reverse on a rune slice so multi-byte characters are swapped whole
	// rather than byte by byte.
	runes := []rune(str)
	for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
		runes[i], runes[j] = runes[j], runes[i]
	}
	fmt.Println("Reversed:", string(runes))

	// Convert to uppercase
	fmt.Println("Uppercase:", strings.ToUpper(str))

	// Convert to lowercase
	fmt.Println("Lowercase:", strings.ToLower(str))

	// Title case. strings.Title is deprecated, so the first rune of every
	// whitespace-separated word is title-cased with the unicode package.
	fmt.Println("Title:", titleCase(str))
}

// titleCase returns s with the first rune of each whitespace-separated word
// mapped through unicode.ToTitle; every other rune is left unchanged.
func titleCase(s string) string {
	// strings.Map visits runes in order, so the closure can carry the
	// "next rune starts a word" state between calls.
	atWordStart := true
	return strings.Map(func(r rune) rune {
		if unicode.IsSpace(r) {
			atWordStart = true
			return r
		}
		if atWordStart {
			atWordStart = false
			return unicode.ToTitle(r)
		}
		return r
	}, s)
}
Character Classification
package main
import (
"fmt"
"unicode"
)
// main tallies the runes of a sample string into letters, digits, spaces
// and everything else, then prints the four counts.
func main() {
	input := "Hello123!@#"

	var nLetters, nDigits, nSpaces, nOthers int
	for _, ch := range input {
		// The checks are ordered; the first one that matches claims the rune.
		if unicode.IsLetter(ch) {
			nLetters++
		} else if unicode.IsDigit(ch) {
			nDigits++
		} else if unicode.IsSpace(ch) {
			nSpaces++
		} else {
			nOthers++
		}
	}

	fmt.Printf("Letters: %d, Digits: %d, Spaces: %d, Others: %d\n",
		nLetters, nDigits, nSpaces, nOthers)
}
Filtering Characters
package main
import (
"fmt"
"strings"
"unicode"
)
// main filters a sample string three ways with strings.Map: letters only,
// letters and digits, and everything except whitespace.
func main() {
	input := "Hello123World!@#"

	// filter returns input without the runes that keep rejects. Returning a
	// negative rune from the mapping function tells strings.Map to drop it.
	filter := func(keep func(rune) bool) string {
		return strings.Map(func(r rune) rune {
			if !keep(r) {
				return -1
			}
			return r
		}, input)
	}

	isAlnum := func(r rune) bool { return unicode.IsLetter(r) || unicode.IsDigit(r) }
	notSpace := func(r rune) bool { return !unicode.IsSpace(r) }

	fmt.Println("Letters only:", filter(unicode.IsLetter))
	fmt.Println("Alphanumeric:", filter(isAlnum))
	fmt.Println("No spaces:", filter(notSpace))
}
Practical Examples
Counting Unicode Characters
package main
import (
"fmt"
"unicode/utf8"
)
// countCharacters reports two sizes for str: its length in bytes under the
// key "bytes" and its number of runes under the key "runes".
func countCharacters(str string) map[string]int {
	counts := make(map[string]int, 2)
	counts["bytes"] = len(str)
	counts["runes"] = utf8.RuneCountInString(str)
	return counts
}
// main prints the byte and rune counts of several sample strings, from
// plain ASCII to emoji.
func main() {
	for _, sample := range []string{"Hello", "Hello 世界", "🎉🎊🎈", "Café"} {
		stats := countCharacters(sample)
		fmt.Printf("'%s': %d bytes, %d runes\n", sample, stats["bytes"], stats["runes"])
	}
}
Substring Operations with Unicode
package main
import (
"fmt"
)
// main takes substrings by character position by slicing a rune slice
// instead of the string itself.
func main() {
	text := "Hello 世界"

	// Slice indices on chars are character positions, not byte offsets.
	chars := []rune(text)
	total := len(chars)

	fmt.Println("Full string:", text)
	fmt.Println("Substring [0:5]:", string(chars[:5]))
	fmt.Println("Substring [6:8]:", string(chars[6:8]))

	// Prefix of five characters.
	fmt.Println("First 5 chars:", string(chars[:5]))

	// Suffix of two characters.
	fmt.Println("Last 2 chars:", string(chars[total-2:]))
}
Text Normalization
package main
import (
"fmt"
"strings"
"unicode"
)
// normalizeText lower-cases str, drops leading and trailing whitespace, and
// collapses every run of whitespace into a single space.
func normalizeText(str string) string {
	// Splitting on unicode.IsSpace already discards leading, trailing and
	// repeated whitespace, so no separate trim step is needed.
	words := strings.FieldsFunc(strings.ToLower(str), unicode.IsSpace)
	return strings.Join(words, " ")
}
// main runs normalizeText over a few differently-cased and differently-
// spaced inputs and prints each input next to its normalized form.
func main() {
	samples := []string{" Hello World ", "HELLO WORLD", "HeLLo WoRLd"}
	for i := range samples {
		raw := samples[i]
		fmt.Printf("'%s' -> '%s'\n", raw, normalizeText(raw))
	}
}
Best Practices
✅ Good Practices
// Use runes for character-level operations
// reverseString returns s with its runes in reverse order. Working on runes
// rather than bytes keeps each multi-byte character intact.
func reverseString(s string) string {
	src := []rune(s)
	out := make([]rune, len(src))
	// Write each rune to its mirrored position.
	for i, r := range src {
		out[len(src)-1-i] = r
	}
	return string(out)
}
// Use bytes for binary data
// processData is a placeholder showing the recommended signature for code
// that handles binary data: it takes a []byte rather than a string. The body
// is intentionally empty.
func processData(data []byte) {
// Work with raw bytes
}
// Validate UTF-8
// processString rejects input that is not valid UTF-8. It returns nil for a
// valid string and an "invalid UTF-8" error otherwise.
func processString(s string) error {
	if utf8.ValidString(s) {
		// Process string
		return nil
	}
	return fmt.Errorf("invalid UTF-8")
}
// Use unicode package for character properties
// isValidIdentifier reports whether s is a non-empty identifier: the first
// rune must be a letter or '_', and every following rune must be a letter,
// a digit or '_'. Letters and digits are judged by the unicode package, so
// non-ASCII identifiers are accepted.
func isValidIdentifier(s string) bool {
	// Without this guard the loop below never runs for "" and the
	// function would report the empty string as valid.
	if s == "" {
		return false
	}
	for i, r := range s {
		switch {
		case r == '_' || unicode.IsLetter(r):
			// Allowed in any position.
		case i > 0 && unicode.IsDigit(r):
			// Digits are allowed anywhere except the first position.
		default:
			return false
		}
	}
	return true
}
❌ Anti-Patterns
// Don't use len() for character count
count := len(str) // Wrong for Unicode: len() counts bytes, not characters
// Don't index strings directly for Unicode
char := str[5] // Yields one byte, which may be the middle of a multi-byte character
// Don't assume ASCII
for i := 0; i < len(str); i++ {
// str[i] is a single byte here, so multi-byte characters are visited piecewise
}
// Don't ignore encoding
data := []byte(str) // Copies the raw UTF-8 bytes; nothing is lost, but one character may span several bytes
Common Pitfalls
String Length Confusion
// ❌ Wrong: len() returns bytes, not characters
str := "Hello 世界"
fmt.Println(len(str)) // 12 bytes (6 for "Hello " plus 3 for each of 世 and 界), not 8 characters
// ✅ Correct: Use RuneCountInString
fmt.Println(utf8.RuneCountInString(str)) // 8 characters
Indexing Issues
// ❌ Wrong: Byte indexing with Unicode
str := "世界"
fmt.Println(str[0]) // 228 (first byte of first character)
// ✅ Correct: Use rune conversion
runes := []rune(str)
// Convert the rune back to a string before printing: Println on a bare rune
// prints its numeric value (19990 here), not the character.
fmt.Println(string(runes[0])) // 世 (first character)
Resources
- Go unicode Package Documentation
- Go unicode/utf8 Package Documentation
- UTF-8 Specification
- Unicode Standard
- Go Strings, Bytes, Runes and Characters
Summary
Understanding bytes, runes, and Unicode in Go is crucial for text processing:
- Bytes are 8-bit values; runes are Unicode code points
- Strings are UTF-8 encoded sequences of bytes
- Use len() for byte count, utf8.RuneCountInString() for character count
- Use []rune() for character-level operations
- Use the unicode package for character classification
- Always validate UTF-8 when processing external input
With these concepts, you can correctly handle UTF-8 text written in any human language; text in other encodings must be converted to UTF-8 first.
Comments