Bytes, Runes, and Unicode in Go
Understanding how Go handles bytes, runes, and Unicode is essential for working with text and strings. Go has excellent Unicode support built-in, but it’s important to understand the distinctions between bytes, runes, and strings. This guide covers everything you need to know.
Understanding Bytes and Runes
Bytes vs Runes
package main
import (
"fmt"
)
// main contrasts the byte length and the rune length of an ASCII-only string
// with those of a string that contains multi-byte characters.
func main() {
	// ASCII only: every character is encoded as exactly one byte.
	str := "Hello"
	fmt.Println("String:", str)
	fmt.Println("Length:", len(str))
	fmt.Println("Bytes:", []byte(str))

	// Mixed ASCII and CJK text: each of the two CJK characters takes three
	// bytes in UTF-8, so the string is 12 bytes long but only 8 runes long.
	str2 := "Hello 世界"
	fmt.Println("\nString:", str2)
	fmt.Println("Length (bytes):", len(str2))
	fmt.Println("Length (runes):", len([]rune(str2)))
	fmt.Println("Bytes:", []byte(str2))
	fmt.Println("Runes:", []rune(str2))
}
Iterating Over Strings
package main
import (
"fmt"
)
// main walks the same string three ways to show how the index and the value
// differ between byte iteration, range iteration, and a rune slice.
func main() {
	str := "Hello 世界"

	// Indexing yields single bytes, so each three-byte character is printed
	// as three unrelated values (wrong for non-ASCII text).
	fmt.Println("By byte:")
	for i := 0; i < len(str); i++ {
		fmt.Printf("Index %d: %c (byte: %d)\n", i, str[i], str[i])
	}

	// range decodes UTF-8: r is a whole code point and i is the byte offset
	// at which it starts, so the indices jump by the encoded width.
	fmt.Println("\nBy rune:")
	for i, r := range str {
		fmt.Printf("Index %d: %c (rune: %U)\n", i, r, r)
	}

	// Converting to []rune gives consecutive character indices instead of
	// byte offsets.
	fmt.Println("\nUsing rune slice:")
	runes := []rune(str)
	for i, r := range runes {
		fmt.Printf("Index %d: %c (rune: %U)\n", i, r, r)
	}
}
Working with Bytes
Byte Slices
package main
import (
"fmt"
)
// main demonstrates the basic ways of creating, mutating, and printing byte
// slices and single byte values.
func main() {
	// A []byte built from a string can be modified in place.
	raw := []byte("Hello")
	fmt.Println("Bytes:", raw)

	raw[0] = 'J'
	fmt.Println("Modified:", string(raw))

	// The same five values spelled as character literals instead of numbers.
	spelled := []byte{'H', 'e', 'l', 'l', 'o'}
	fmt.Println("From bytes:", string(spelled))

	// A character literal converted to a single byte value.
	letter := byte('A')
	fmt.Printf("Byte: %c (value: %d)\n", letter, letter)
}
Byte Operations
package main
import (
"bytes"
"fmt"
)
// main exercises the comparison, search, count, and replace helpers of the
// bytes package on a few small slices.
func main() {
	hello := []byte("hello")
	twin := []byte("hello")
	world := []byte("world")
	lowerL := []byte("l")

	// Equality is by content, not by identity.
	fmt.Println("b1 == b2:", bytes.Equal(hello, twin))
	fmt.Println("b1 == b3:", bytes.Equal(hello, world))

	// Searching and counting.
	hasEll := bytes.Contains(hello, []byte("ell"))
	fmt.Println("Contains 'ell':", hasEll)
	fmt.Println("Index of 'l':", bytes.IndexByte(hello, 'l'))
	fmt.Println("Count 'l':", bytes.Count(hello, lowerL))

	// Replace every occurrence; the result is a new slice.
	replaced := bytes.ReplaceAll(hello, lowerL, []byte("L"))
	fmt.Println("Replace:", string(replaced))
}
Working with Runes
Rune Basics
package main
import (
"fmt"
"unicode"
)
// main prints a few runes in character, numeric, and U+XXXX form and then
// queries some of the classification predicates of the unicode package.
func main() {
	// rune is an alias for int32 and holds one Unicode code point.
	var r rune = 'A'
	fmt.Printf("Rune: %c (value: %d, Unicode: %U)\n", r, r, r)

	// A rune literal holds exactly one code point, ASCII or not.
	r2 := '世'
	fmt.Printf("Rune: %c (value: %d, Unicode: %U)\n", r2, r2, r2)

	// Ranging over a string yields its runes one by one.
	str := "Hello"
	for _, ch := range str {
		fmt.Printf("%c: %U\n", ch, ch)
	}

	// Classification predicates.
	fmt.Println("\nRune properties:")
	fmt.Println("IsLetter('A'):", unicode.IsLetter('A'))
	fmt.Println("IsDigit('5'):", unicode.IsDigit('5'))
	fmt.Println("IsSpace(' '):", unicode.IsSpace(' '))
	fmt.Println("IsUpper('A'):", unicode.IsUpper('A'))
	fmt.Println("IsLower('a'):", unicode.IsLower('a'))
}
Rune Conversion
package main
import (
"fmt"
"unicode"
)
// main demonstrates per-rune case conversion and prints the Unicode general
// category of a few sample runes.
func main() {
	// Case conversion operates on single runes.
	r := 'a'
	fmt.Printf("'%c' to upper: '%c'\n", r, unicode.ToUpper(r))
	r = 'A'
	fmt.Printf("'%c' to lower: '%c'\n", r, unicode.ToLower(r))

	// Title case maps a rune to the form used at the start of a word.
	r = 'a'
	fmt.Printf("'%c' to title: '%c'\n", r, unicode.ToTitle(r))

	// The unicode package has no Category function; the category is found
	// by testing the rune against the package's category range tables.
	fmt.Println("\nRune categories:")
	fmt.Println("Category of 'A':", runeCategory('A'))
	fmt.Println("Category of '5':", runeCategory('5'))
	fmt.Println("Category of ' ':", runeCategory(' '))
}

// generalCategories pairs each two-letter Unicode general category name with
// the range table the unicode package exports for it.
var generalCategories = []struct {
	name  string
	table *unicode.RangeTable
}{
	{"Lu", unicode.Lu}, {"Ll", unicode.Ll}, {"Lt", unicode.Lt}, {"Lm", unicode.Lm}, {"Lo", unicode.Lo},
	{"Mn", unicode.Mn}, {"Mc", unicode.Mc}, {"Me", unicode.Me},
	{"Nd", unicode.Nd}, {"Nl", unicode.Nl}, {"No", unicode.No},
	{"Pc", unicode.Pc}, {"Pd", unicode.Pd}, {"Ps", unicode.Ps}, {"Pe", unicode.Pe},
	{"Pi", unicode.Pi}, {"Pf", unicode.Pf}, {"Po", unicode.Po},
	{"Sm", unicode.Sm}, {"Sc", unicode.Sc}, {"Sk", unicode.Sk}, {"So", unicode.So},
	{"Zs", unicode.Zs}, {"Zl", unicode.Zl}, {"Zp", unicode.Zp},
	{"Cc", unicode.Cc}, {"Cf", unicode.Cf}, {"Cs", unicode.Cs}, {"Co", unicode.Co},
}

// runeCategory returns the two-letter general category name of r, or "Cn"
// (unassigned) when r is in none of the tables listed in generalCategories.
func runeCategory(r rune) string {
	for _, c := range generalCategories {
		if unicode.Is(c.table, r) {
			return c.name
		}
	}
	return "Cn"
}
Unicode and UTF-8
UTF-8 Encoding
package main
import (
"fmt"
"unicode/utf8"
)
// main shows how the unicode/utf8 package counts, encodes, and decodes runes.
func main() {
	// Byte length and rune count differ once multi-byte characters appear.
	str := "Hello 世界"
	fmt.Println("String:", str)
	fmt.Println("Byte length:", len(str))
	fmt.Println("Rune length:", utf8.RuneCountInString(str))

	// Encode one rune; utf8.UTFMax bytes is enough for any code point and
	// EncodeRune reports how many of them it actually wrote.
	r := '世'
	buf := make([]byte, utf8.UTFMax)
	n := utf8.EncodeRune(buf, r)
	fmt.Printf("Encoded '%c': %v (length: %d)\n", r, buf[:n], n)

	// Decode the first rune of a byte slice together with its encoded width.
	encoded := []byte("世")
	r, size := utf8.DecodeRune(encoded)
	fmt.Printf("Decoded: '%c' (size: %d)\n", r, size)

	// Decode the last rune; the slice holds one character, so it is the same.
	r, size = utf8.DecodeLastRune(encoded)
	fmt.Printf("Last rune: '%c' (size: %d)\n", r, size)
}
Validating UTF-8
package main
import (
"fmt"
"unicode/utf8"
)
// main checks a well-formed string and a malformed byte sequence with the
// validation helpers of unicode/utf8.
func main() {
	// A string literal with multi-byte characters is still valid UTF-8.
	validStr := "Hello 世界"
	fmt.Println("Valid UTF-8:", utf8.ValidString(validStr))

	// 0xFF and 0xFE never occur in UTF-8, so this reports false.
	invalidBytes := []byte{0xFF, 0xFE}
	fmt.Println("Valid bytes:", utf8.Valid(invalidBytes))

	// Typical guard before processing text.
	if utf8.ValidString(validStr) {
		fmt.Println("String is valid UTF-8")
	}
}
Text Processing
String Manipulation with Runes
package main
import (
"fmt"
"strings"
"unicode"
)
// main reverses a string rune by rune and prints its upper-, lower-, and
// title-cased forms.
func main() {
	str := "Hello 世界"

	// Reverse on a rune slice so multi-byte characters stay intact.
	runes := []rune(str)
	for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
		runes[i], runes[j] = runes[j], runes[i]
	}
	fmt.Println("Reversed:", string(runes))

	// Whole-string case conversion.
	fmt.Println("Uppercase:", strings.ToUpper(str))
	fmt.Println("Lowercase:", strings.ToLower(str))

	// strings.Title is deprecated, so title-case with the unicode package.
	fmt.Println("Title:", titleCase(str))
}

// titleCase returns s with the first rune of every whitespace-separated word
// mapped to its title case; all other runes are copied unchanged.
func titleCase(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	atWordStart := true
	for _, r := range s {
		if atWordStart {
			r = unicode.ToTitle(r)
		}
		b.WriteRune(r)
		// The next rune starts a word only when this one is whitespace.
		atWordStart = unicode.IsSpace(r)
	}
	return b.String()
}
Character Classification
package main
import (
"fmt"
"unicode"
)
// main tallies the runes of a sample string into letters, digits, whitespace,
// and everything else, then prints the four totals.
func main() {
	const sample = "Hello123!@#"

	var tally struct {
		letters, digits, spaces, others int
	}
	for _, ch := range sample {
		// The checks run in a fixed order and the first match wins.
		if unicode.IsLetter(ch) {
			tally.letters++
			continue
		}
		if unicode.IsDigit(ch) {
			tally.digits++
			continue
		}
		if unicode.IsSpace(ch) {
			tally.spaces++
			continue
		}
		tally.others++
	}

	fmt.Printf("Letters: %d, Digits: %d, Spaces: %d, Others: %d\n",
		tally.letters, tally.digits, tally.spaces, tally.others)
}
Filtering Characters
package main
import (
"fmt"
"strings"
"unicode"
)
// main filters one sample string three ways with strings.Map: letters only,
// letters and digits, and everything except whitespace.
func main() {
	const input = "Hello123World!@#"

	// filter returns input without the runes that keep rejects; strings.Map
	// drops a rune when the mapping function returns a negative value.
	filter := func(keep func(rune) bool) string {
		return strings.Map(func(r rune) rune {
			if !keep(r) {
				return -1
			}
			return r
		}, input)
	}

	fmt.Println("Letters only:", filter(unicode.IsLetter))

	isAlnum := func(r rune) bool { return unicode.IsLetter(r) || unicode.IsDigit(r) }
	fmt.Println("Alphanumeric:", filter(isAlnum))

	notSpace := func(r rune) bool { return !unicode.IsSpace(r) }
	fmt.Println("No spaces:", filter(notSpace))
}
Practical Examples
Counting Unicode Characters
package main
import (
"fmt"
"unicode/utf8"
)
// countCharacters reports two sizes of str in a map: "bytes" holds its length
// in bytes and "runes" holds the number of runes it contains.
func countCharacters(str string) map[string]int {
	sizes := make(map[string]int, 2)
	sizes["bytes"] = len(str)
	sizes["runes"] = utf8.RuneCountInString(str)
	return sizes
}
// main prints the byte and rune counts of sample strings whose characters
// need one, two, three, and four bytes each in UTF-8.
func main() {
	tests := []string{
		"Hello",      // ASCII: one byte per character
		"Hello 世界", // CJK: three bytes per character
		"😀😁😂",        // emoji: four bytes per character
		"Café",       // accented Latin: "é" takes two bytes
	}
	for _, str := range tests {
		counts := countCharacters(str)
		fmt.Printf("'%s': %d bytes, %d runes\n", str, counts["bytes"], counts["runes"])
	}
}
Substring Operations with Unicode
package main
import (
"fmt"
)
// main slices a string by character position by converting it to a rune
// slice first, so the slice bounds count characters rather than bytes.
func main() {
	str := "Hello 世界"

	// The string has 8 runes: "Hello", a space, and two CJK characters.
	runes := []rune(str)
	fmt.Println("Full string:", str)
	fmt.Println("Substring [0:5]:", string(runes[0:5]))
	fmt.Println("Substring [6:8]:", string(runes[6:8]))

	// First N characters.
	fmt.Println("First 5 chars:", string(runes[:5]))

	// Last N characters.
	fmt.Println("Last 2 chars:", string(runes[len(runes)-2:]))
}
Text Normalization
package main
import (
"fmt"
"strings"
"unicode"
)
// normalizeText lower-cases str, strips leading and trailing whitespace, and
// collapses every run of whitespace between words into a single space.
func normalizeText(str string) string {
	// Splitting on unicode.IsSpace discards leading, trailing, and repeated
	// whitespace in one pass, so no separate trim step is needed.
	words := strings.FieldsFunc(strings.ToLower(str), unicode.IsSpace)
	return strings.Join(words, " ")
}
// main runs normalizeText over a few differently cased and spaced inputs and
// prints each input next to its normalized form.
func main() {
	samples := [...]string{
		" Hello World ",
		"HELLO WORLD",
		"HeLLo WoRLd",
	}
	for i := range samples {
		raw := samples[i]
		normalized := normalizeText(raw)
		fmt.Printf("'%s' -> '%s'\n", raw, normalized)
	}
}
Best Practices
✅ Good Practices
// Use runes for character-level operations.
//
// reverseString returns s with its runes in reverse order. It works on a rune
// slice rather than on bytes, so multi-byte characters are kept whole.
func reverseString(s string) string {
	forward := []rune(s)
	last := len(forward) - 1
	backward := make([]rune, len(forward))
	for i, r := range forward {
		backward[last-i] = r
	}
	return string(backward)
}
// Use bytes for binary data
//
// processData is a stub that illustrates taking raw binary input as a []byte
// parameter; its body contains no statements.
func processData(data []byte) {
// Work with raw bytes
}
// Validate UTF-8 before processing.
//
// processString returns nil when s is valid UTF-8 and an error with the
// message "invalid UTF-8" otherwise. The actual processing is left as a
// placeholder.
func processString(s string) error {
	if utf8.ValidString(s) {
		// Process string
		return nil
	}
	return fmt.Errorf("invalid UTF-8")
}
// Use unicode package for character properties.
//
// isValidIdentifier reports whether s is a non-empty sequence of letters,
// digits, and underscores that does not start with a digit. Letters and
// digits are judged by the unicode package, so non-ASCII ones qualify.
func isValidIdentifier(s string) bool {
	// An empty string has no first character and is not an identifier.
	if s == "" {
		return false
	}
	for i, r := range s {
		switch {
		case r == '_' || unicode.IsLetter(r):
			// Letters and underscores are allowed in every position.
		case i > 0 && unicode.IsDigit(r):
			// Digits are allowed anywhere except the first position.
		default:
			return false
		}
	}
	return true
}
❌ Anti-Patterns
// Don't use len() for character count
count := len(str) // Wrong for Unicode: len counts bytes, not characters
// Don't index strings directly for Unicode
char := str[5] // Yields one byte, which may be the middle of a multi-byte character
// Don't assume ASCII
for i := 0; i < len(str); i++ {
// i steps one byte at a time, so str[i] is not a whole character for non-ASCII text
}
// Don't ignore encoding
data := []byte(str) // These are the UTF-8 bytes; one character may span several elements
Common Pitfalls
String Length Confusion
// ❌ Wrong: len() returns bytes, not characters
str := "Hello 世界"
fmt.Println(len(str)) // 12, not 8
// ✅ Correct: Use RuneCountInString
fmt.Println(utf8.RuneCountInString(str)) // 8
Indexing Issues
// ❌ Wrong: Byte indexing with Unicode
str := "世界"
fmt.Println(str[0]) // 228 (first byte of first character)
// ✅ Correct: Use rune conversion
runes := []rune(str)
// Convert the rune back to a string; printing runes[0] directly would show
// its numeric code point (19990) instead of the character.
fmt.Println(string(runes[0])) // 世 (first character)
Resources
- Go unicode Package Documentation
- Go unicode/utf8 Package Documentation
- UTF-8 Specification
- Unicode Standard
- Go Strings, Bytes, Runes and Characters
Summary
Understanding bytes, runes, and Unicode in Go is crucial for text processing:
- Bytes are 8-bit values; runes are Unicode code points
- Strings are UTF-8 encoded sequences of bytes
- Use `len()` for byte count, `utf8.RuneCountInString()` for character count
- Use `[]rune()` for character-level operations
- Use the `unicode` package for character classification
- Always validate UTF-8 when processing external input
With these concepts, you can handle text correctly in any language and encoding.
Comments