Skip to main content
⚡ Calmops

Bytes, Runes, and Unicode in Go

Bytes, Runes, and Unicode in Go

Understanding how Go handles bytes, runes, and Unicode is essential for working with text and strings. Go has excellent Unicode support built-in, but it’s important to understand the distinctions between bytes, runes, and strings. This guide covers everything you need to know.

Understanding Bytes and Runes

Bytes vs Runes

package main

import (
	"fmt"
)

// main contrasts the byte view and the rune view of an ASCII-only string
// and of a string that contains multi-byte UTF-8 characters.
func main() {
	// ASCII only: every character occupies one byte, so len() matches the
	// number of characters.
	str := "Hello"
	fmt.Println("String:", str)
	fmt.Println("Length:", len(str))
	fmt.Println("Bytes:", []byte(str))

	// Each of the two CJK characters takes three bytes in UTF-8, so the
	// byte length (12) differs from the rune count (8).
	str2 := "Hello 世界"
	fmt.Println("\nString:", str2)
	fmt.Println("Length (bytes):", len(str2))
	fmt.Println("Length (runes):", len([]rune(str2)))
	fmt.Println("Bytes:", []byte(str2))
	fmt.Println("Runes:", []rune(str2))
}

Iterating Over Strings

package main

import (
	"fmt"
)

// main walks the same string three ways: byte by byte, rune by rune with
// range, and through a []rune conversion.
func main() {
	str := "Hello 世界"

	// Iterate by byte (wrong for Unicode): each byte of a multi-byte
	// character is printed as if it were a character of its own.
	fmt.Println("By byte:")
	for i := 0; i < len(str); i++ {
		fmt.Printf("Index %d: %c (byte: %d)\n", i, str[i], str[i])
	}

	// Iterate by rune (correct for Unicode): range decodes UTF-8, and i is
	// the byte offset at which each rune starts.
	fmt.Println("\nBy rune:")
	for i, r := range str {
		fmt.Printf("Index %d: %c (rune: %U)\n", i, r, r)
	}

	// Convert to runes for indexed access: here i is the rune position,
	// not a byte offset.
	fmt.Println("\nUsing rune slice:")
	runes := []rune(str)
	for i, r := range runes {
		fmt.Printf("Index %d: %c (rune: %U)\n", i, r, r)
	}
}

Working with Bytes

Byte Slices

package main

import (
	"fmt"
)

// main demonstrates creating, mutating and printing byte slices.
func main() {
	// Converting a string yields a mutable copy of its bytes.
	word := "Hello"
	raw := []byte(word)
	fmt.Println("Bytes:", raw)

	// Overwrite the first byte in place, then convert back to show the result.
	raw[0] = 'J'
	fmt.Println("Modified:", string(raw))

	// The same greeting spelled out as decimal byte values.
	codes := []byte{72, 101, 108, 108, 111}
	fmt.Println("From bytes:", string(codes))

	// A character literal that fits in one byte can be stored in a byte.
	letter := byte('A')
	fmt.Printf("Byte: %c (value: %d)\n", letter, letter)
}

Byte Operations

package main

import (
	"bytes"
	"fmt"
)

// main exercises the comparison, search and replace helpers of the bytes
// package on small byte slices.
func main() {
	var (
		first  = []byte("hello")
		same   = []byte("hello")
		other  = []byte("world")
		needle = []byte("l")
	)

	// Equality is by content; slices cannot be compared with ==.
	fmt.Println("b1 == b2:", bytes.Equal(first, same))
	fmt.Println("b1 == b3:", bytes.Equal(first, other))

	// Sub-slice search.
	fmt.Println("Contains 'ell':", bytes.Contains(first, []byte("ell")))

	// Offset of the first match.
	fmt.Println("Index of 'l':", bytes.Index(first, needle))

	// Number of non-overlapping matches.
	fmt.Println("Count 'l':", bytes.Count(first, needle))

	// Replace every match; the result is a new slice.
	upper := bytes.ReplaceAll(first, needle, []byte("L"))
	fmt.Println("Replace:", string(upper))
}

Working with Runes

Rune Basics

package main

import (
	"fmt"
	"unicode"
)

// main shows rune literals, ranging over a string rune by rune, and the
// classification predicates of the unicode package.
func main() {
	// Rune is an alias for int32
	var r rune = 'A'
	fmt.Printf("Rune: %c (value: %d, Unicode: %U)\n", r, r, r)

	// A rune literal holds exactly one code point, including non-ASCII ones.
	r2 := '世'
	fmt.Printf("Rune: %c (value: %d, Unicode: %U)\n", r2, r2, r2)

	// Ranging over a string yields its runes one at a time.
	str := "Hello"
	for _, ch := range str {
		fmt.Printf("%c: %U\n", ch, ch)
	}

	// Check rune properties
	fmt.Println("\nRune properties:")
	fmt.Println("IsLetter('A'):", unicode.IsLetter('A'))
	fmt.Println("IsDigit('5'):", unicode.IsDigit('5'))
	fmt.Println("IsSpace(' '):", unicode.IsSpace(' '))
	fmt.Println("IsUpper('A'):", unicode.IsUpper('A'))
	fmt.Println("IsLower('a'):", unicode.IsLower('a'))
}

Rune Conversion

package main

import (
	"fmt"
	"unicode"
)

// main demonstrates per-rune case conversion and general-category lookup.
func main() {
	// Convert case
	r := 'a'
	fmt.Printf("'%c' to upper: '%c'\n", r, unicode.ToUpper(r))

	r = 'A'
	fmt.Printf("'%c' to lower: '%c'\n", r, unicode.ToLower(r))

	// Convert to title case
	r = 'a'
	fmt.Printf("'%c' to title: '%c'\n", r, unicode.ToTitle(r))

	// Get rune category. The unicode package has no Category function, so
	// the lookup goes through runeCategory below.
	fmt.Println("\nRune categories:")
	fmt.Println("Category of 'A':", runeCategory('A'))
	fmt.Println("Category of '5':", runeCategory('5'))
	fmt.Println("Category of ' ':", runeCategory(' '))
}

// runeCategory returns the two-letter Unicode general category of r (for
// example "Lu" or "Nd"), or "" when no table in unicode.Categories
// contains r.
func runeCategory(r rune) string {
	for name, table := range unicode.Categories {
		// Skip the one-letter umbrella groups ("L", "N", ...) and the "LC"
		// cased-letter union: they overlap the specific categories, and map
		// iteration order is random, so including them would make the
		// result nondeterministic.
		if len(name) != 2 || name == "LC" {
			continue
		}
		if unicode.Is(table, r) {
			return name
		}
	}
	return ""
}

Unicode and UTF-8

UTF-8 Encoding

package main

import (
	"fmt"
	"unicode/utf8"
)

// main demonstrates counting, encoding and decoding runes with the
// unicode/utf8 package.
func main() {
	// UTF-8 encoding
	str := "Hello 世界"
	fmt.Println("String:", str)
	fmt.Println("Byte length:", len(str))
	fmt.Println("Rune length:", utf8.RuneCountInString(str))

	// Encode rune to UTF-8. utf8.UTFMax is the longest possible encoding,
	// so the buffer is always large enough; n is the number of bytes used.
	r := '世'
	buf := make([]byte, utf8.UTFMax)
	n := utf8.EncodeRune(buf, r)
	fmt.Printf("Encoded '%c': %v (length: %d)\n", r, buf[:n], n)

	// Decode rune from UTF-8
	bytes := []byte("世")
	r, size := utf8.DecodeRune(bytes)
	fmt.Printf("Decoded: '%c' (size: %d)\n", r, size)

	// Decode last rune
	r, size = utf8.DecodeLastRune(bytes)
	fmt.Printf("Last rune: '%c' (size: %d)\n", r, size)
}

Validating UTF-8

package main

import (
	"fmt"
	"unicode/utf8"
)

// main checks a well-formed string and a malformed byte sequence for
// UTF-8 validity.
func main() {
	// Valid UTF-8
	validStr := "Hello 世界"
	fmt.Println("Valid UTF-8:", utf8.ValidString(validStr))

	// Invalid UTF-8 (simulated): 0xFF and 0xFE never occur in UTF-8.
	invalidBytes := []byte{0xFF, 0xFE}
	fmt.Println("Valid bytes:", utf8.Valid(invalidBytes))

	// Check if string is valid
	if utf8.ValidString(validStr) {
		fmt.Println("String is valid UTF-8")
	}
}

Text Processing

String Manipulation with Runes

package main

import (
	"fmt"
	"strings"
	"unicode"
)

// main reverses a string rune by rune and prints its upper-, lower- and
// title-cased forms.
func main() {
	// Reverse string: swap runes, not bytes, so multi-byte characters
	// stay intact.
	str := "Hello 世界"
	runes := []rune(str)
	for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
		runes[i], runes[j] = runes[j], runes[i]
	}
	fmt.Println("Reversed:", string(runes))

	// Convert to uppercase
	fmt.Println("Uppercase:", strings.ToUpper(str))

	// Convert to lowercase
	fmt.Println("Lowercase:", strings.ToLower(str))

	// Title case. strings.Title is deprecated, so use a small rune-based
	// helper instead.
	fmt.Println("Title:", titleCase(str))
}

// titleCase returns s with the first rune of every word mapped to its
// Unicode title case. A word is a maximal run of letters, digits and
// underscores; all other runes are copied unchanged.
func titleCase(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	atWordStart := true
	for _, r := range s {
		inWord := unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_'
		if inWord && atWordStart {
			r = unicode.ToTitle(r)
		}
		atWordStart = !inWord
		b.WriteRune(r)
	}
	return b.String()
}

Character Classification

package main

import (
	"fmt"
	"unicode"
)

// main tallies the runes of a sample string into letters, digits,
// whitespace and everything else, then prints the four counts.
func main() {
	const sample = "Hello123!@#"

	var nLetters, nDigits, nSpaces, nOthers int
	for _, ch := range sample {
		// The checks are ordered; the first predicate that matches wins.
		if unicode.IsLetter(ch) {
			nLetters++
		} else if unicode.IsDigit(ch) {
			nDigits++
		} else if unicode.IsSpace(ch) {
			nSpaces++
		} else {
			nOthers++
		}
	}

	fmt.Printf("Letters: %d, Digits: %d, Spaces: %d, Others: %d\n",
		nLetters, nDigits, nSpaces, nOthers)
}

Filtering Characters

package main

import (
	"fmt"
	"strings"
	"unicode"
)

// main filters a sample string three ways with strings.Map: letters only,
// letters and digits only, and everything except whitespace.
func main() {
	const input = "Hello123World!@#"

	// keep adapts a predicate into a strings.Map mapping function: runes
	// the predicate rejects are mapped to -1, which strings.Map drops.
	keep := func(ok func(rune) bool) func(rune) rune {
		return func(ch rune) rune {
			if !ok(ch) {
				return -1
			}
			return ch
		}
	}

	// Keep only letters
	fmt.Println("Letters only:", strings.Map(keep(unicode.IsLetter), input))

	// Keep only alphanumeric
	isAlnum := func(ch rune) bool { return unicode.IsLetter(ch) || unicode.IsDigit(ch) }
	fmt.Println("Alphanumeric:", strings.Map(keep(isAlnum), input))

	// Remove spaces
	notSpace := func(ch rune) bool { return !unicode.IsSpace(ch) }
	fmt.Println("No spaces:", strings.Map(keep(notSpace), input))
}

Practical Examples

Counting Unicode Characters

package main

import (
	"fmt"
	"unicode/utf8"
)

// countCharacters reports two sizes of str: "bytes" is its length in
// bytes and "runes" is the number of runes utf8.RuneCountInString finds.
func countCharacters(str string) map[string]int {
	sizes := make(map[string]int, 2)
	sizes["bytes"] = len(str)
	sizes["runes"] = utf8.RuneCountInString(str)
	return sizes
}

// main prints the byte and rune counts of sample strings that mix ASCII,
// CJK characters, emoji and an accented letter.
func main() {
	tests := []string{
		"Hello",      // ASCII: 1 byte per character
		"Hello 世界",   // CJK: 3 bytes per character
		"🎉🎊🎈",        // emoji: 4 bytes per character
		"Café",       // precomposed é: 2 bytes
	}

	for _, str := range tests {
		counts := countCharacters(str)
		fmt.Printf("'%s': %d bytes, %d runes\n", str, counts["bytes"], counts["runes"])
	}
}

Substring Operations with Unicode

package main

import (
	"fmt"
)

// main takes substrings by character position by slicing a []rune copy
// of the string instead of the string itself.
func main() {
	str := "Hello 世界"

	// Get substring by rune index. The string has 8 runes, so the slice
	// bounds below are character positions, not byte offsets.
	runes := []rune(str)
	fmt.Println("Full string:", str)
	fmt.Println("Substring [0:5]:", string(runes[0:5]))
	fmt.Println("Substring [6:8]:", string(runes[6:8]))

	// Get first N characters
	fmt.Println("First 5 chars:", string(runes[:5]))

	// Get last N characters
	fmt.Println("Last 2 chars:", string(runes[len(runes)-2:]))
}

Text Normalization

package main

import (
	"fmt"
	"strings"
	"unicode"
)

// normalizeText lower-cases str, strips leading and trailing whitespace,
// and collapses every internal run of whitespace to a single space.
func normalizeText(str string) string {
	// Splitting on unicode.IsSpace already discards leading, trailing and
	// repeated whitespace, so no separate trim step is needed.
	words := strings.FieldsFunc(strings.ToLower(str), unicode.IsSpace)
	return strings.Join(words, " ")
}

// main runs normalizeText over a few sample inputs and prints each input
// next to its normalized form.
func main() {
	samples := [...]string{
		"  Hello   World  ",
		"HELLO WORLD",
		"HeLLo WoRLd",
	}

	for i := range samples {
		raw := samples[i]
		fmt.Printf("'%s' -> '%s'\n", raw, normalizeText(raw))
	}
}

Best Practices

✅ Good Practices

// Use runes for character-level operations
//
// reverseString returns s with its runes in reverse order.
func reverseString(s string) string {
	forward := []rune(s)
	backward := make([]rune, len(forward))
	last := len(forward) - 1
	for pos, ch := range forward {
		// The rune at pos lands at the mirrored position from the end.
		backward[last-pos] = ch
	}
	return string(backward)
}

// Use bytes for binary data
//
// processData is a placeholder that takes raw data as a []byte; its body
// is empty in this example.
func processData(data []byte) {
	// Work with raw bytes
}

// Validate UTF-8
//
// processString returns nil when s is valid UTF-8 and an "invalid UTF-8"
// error otherwise. The processing step itself is left as a placeholder.
func processString(s string) error {
	if utf8.ValidString(s) {
		// Process string
		return nil
	}
	return fmt.Errorf("invalid UTF-8")
}

// Use unicode package for character properties
//
// isValidIdentifier reports whether s is a non-empty sequence of Unicode
// letters, digits and underscores whose first rune is not a digit.
func isValidIdentifier(s string) bool {
	// An empty name is not an identifier; without this guard the loop
	// below would never run and the function would report true.
	if s == "" {
		return false
	}
	for i, r := range s {
		if unicode.IsLetter(r) || r == '_' {
			continue
		}
		// Digits are allowed anywhere except the first position.
		if i > 0 && unicode.IsDigit(r) {
			continue
		}
		return false
	}
	return true
}

❌ Anti-Patterns

// Don't use len() for character count
count := len(str) // Wrong for Unicode: this is the number of bytes, not characters

// Don't index strings directly for Unicode
char := str[5] // Yields a single byte, which may fall in the middle of a multi-byte character

// Don't assume ASCII
for i := 0; i < len(str); i++ {
	// Visits one byte per iteration, so multi-byte characters are seen in pieces
}

// Don't ignore encoding
data := []byte(str) // Lossless copy of the UTF-8 bytes, but one character may span several of them

Common Pitfalls

String Length Confusion

// ❌ Wrong: len() returns bytes, not characters
str := "Hello 世界"
fmt.Println(len(str)) // 12, not 8

// ✅ Correct: Use RuneCountInString
fmt.Println(utf8.RuneCountInString(str)) // 8

Indexing Issues

// ❌ Wrong: Byte indexing with Unicode
str := "世界"
fmt.Println(str[0]) // 228 (first byte of first character)

// ✅ Correct: Use rune conversion
runes := []rune(str)
// Convert back to a string before printing: Println on a bare rune prints
// its numeric code point (19990) rather than the character.
fmt.Println(string(runes[0])) // 世 (first character)

Resources

Summary

Understanding bytes, runes, and Unicode in Go is crucial for text processing:

  • Bytes are 8-bit values; runes are Unicode code points
  • Strings are UTF-8 encoded sequences of bytes
  • Use len() for byte count, utf8.RuneCountInString() for character count
  • Use []rune() for character-level operations
  • Use unicode package for character classification
  • Always validate UTF-8 when processing external input

With these concepts, you can handle text correctly in any language, as long as it is UTF-8 encoded; text in other encodings must be converted to UTF-8 first.

Comments