Text Processing and String Algorithms in Go
Text processing is a fundamental skill in programming. Go provides excellent tools for working with strings and implementing text algorithms. This guide covers practical techniques and algorithms for text processing.
String Manipulation
Basic String Operations
package main
import (
"fmt"
"strings"
)
// main walks through the everyday helpers of the strings package using a
// single sample sentence and prints the outcome of each call.
func main() {
	greeting := "Hello, World!"
	const needle = "World"

	// len reports the size in bytes, not in runes.
	fmt.Println("Length:", len(greeting))

	// Slicing by byte offsets yields the first five bytes.
	fmt.Println("Substring:", greeting[:5])

	// Substring search: presence first, then byte position.
	fmt.Println("Contains 'World':", strings.Contains(greeting, needle))
	fmt.Println("Index of 'World':", strings.Index(greeting, needle))

	// The final argument limits the replacement to the first occurrence.
	fmt.Println("Replace:", strings.Replace(greeting, needle, "Go", 1))

	// Break the text apart on the separator, then stitch it back together.
	pieces := strings.Split(greeting, ", ")
	fmt.Println("Split:", pieces)
	fmt.Println("Join:", strings.Join(pieces, " - "))

	// Trim strips the listed characters from both ends.
	fmt.Println("Trim:", strings.Trim(greeting, "!"))

	// Case conversion.
	fmt.Println("Upper:", strings.ToUpper(greeting))
	fmt.Println("Lower:", strings.ToLower(greeting))
}
String Building
package main
import (
"fmt"
"strings"
)
// main shows three ways of assembling a string: strings.Builder, the +
// operator, and fmt.Sprintf.
func main() {
	// strings.Builder appends every piece to one growing buffer.
	var sb strings.Builder
	for _, piece := range []string{"Hello", " ", "World"} {
		sb.WriteString(piece)
	}
	fmt.Println(sb.String())

	// Plain concatenation with the + operator.
	joined := "Hello" + " " + "World"
	fmt.Println(joined)

	// fmt.Sprintf formats values into a template.
	name, age := "Alice", 30
	formatted := fmt.Sprintf("Name: %s, Age: %d", name, age)
	fmt.Println(formatted)
}
Text Algorithms
Word Frequency Counter
package main
import (
"fmt"
"strings"
)
// countWords returns how many times each word occurs in text.
//
// Words are the whitespace-separated fields of text, lower-cased and with any
// leading or trailing ".,!?;:" characters removed, so "Dog." and "dog" are
// counted together. A field that consists only of that punctuation (for
// example "...") is skipped rather than being tallied under the empty string.
func countWords(text string) map[string]int {
	frequency := make(map[string]int)
	for _, field := range strings.Fields(text) {
		// Normalize: case first, then surrounding punctuation.
		word := strings.Trim(strings.ToLower(field), ".,!?;:")
		if word == "" {
			// Nothing but punctuation was left; it is not a word.
			continue
		}
		frequency[word]++
	}
	return frequency
}
// main counts the words of a sample sentence and prints one "word: count"
// line per distinct word. Map iteration order is unspecified, so the lines
// may appear in any order.
func main() {
	const sample = "The quick brown fox jumps over the lazy dog. The dog was lazy."
	for w, n := range countWords(sample) {
		fmt.Printf("%s: %d\n", w, n)
	}
}
Palindrome Checker
package main
import (
"fmt"
"strings"
"unicode"
)
// isPalindrome reports whether text reads the same forwards and backwards
// when only its letters and digits are considered and case is ignored.
//
// The comparison is done rune by rune, so palindromes containing multi-byte
// characters (for example "été") are recognized correctly. Text without any
// letters or digits is considered a palindrome.
func isPalindrome(text string) bool {
	// Keep letters and digits in lower case; returning a negative rune from
	// the mapping function makes strings.Map drop that character.
	normalized := strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return unicode.ToLower(r)
		}
		return -1
	}, text)

	// Compare whole runes rather than bytes: indexing the string directly
	// would pair up the individual bytes of multi-byte characters.
	runes := []rune(normalized)
	for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
		if runes[i] != runes[j] {
			return false
		}
	}
	return true
}
// main runs isPalindrome over a few sample phrases and prints each verdict.
func main() {
	for _, candidate := range []string{
		"A man, a plan, a canal: Panama",
		"race a car",
		"hello",
	} {
		verdict := isPalindrome(candidate)
		fmt.Printf("'%s' is palindrome: %v\n", candidate, verdict)
	}
}
Longest Common Substring
package main
import (
"fmt"
)
// longestCommonSubstring returns the longest run of consecutive characters
// that occurs in both s1 and s2, or "" when they share none (including when
// either input is empty). If several runs have the same maximal length, the
// one that ends first in s1 is returned.
//
// The strings are compared rune by rune, so the result never contains a
// fragment of a multi-byte character. Bytes that are not valid UTF-8 are
// treated as U+FFFD by the rune conversion.
func longestCommonSubstring(s1, s2 string) string {
	a, b := []rune(s1), []rune(s2)
	if len(a) == 0 || len(b) == 0 {
		return ""
	}

	// prev[j] / cur[j] hold the length of the common run ending at a[i-2] /
	// a[i-1] and b[j-1]. Only the previous row is ever consulted, so two rows
	// replace the full (m+1) x (n+1) table. Index 0 stays 0 in both rows.
	prev := make([]int, len(b)+1)
	cur := make([]int, len(b)+1)

	maxLen, endPos := 0, 0
	for i := 1; i <= len(a); i++ {
		for j := 1; j <= len(b); j++ {
			if a[i-1] != b[j-1] {
				// Rows are reused, so a mismatch must reset the cell.
				cur[j] = 0
				continue
			}
			cur[j] = prev[j-1] + 1
			if cur[j] > maxLen {
				maxLen, endPos = cur[j], i
			}
		}
		prev, cur = cur, prev
	}
	return string(a[endPos-maxLen : endPos])
}
// main prints the longest common substring of two sample strings.
func main() {
	const first, second = "abcdef", "fbdamn"
	common := longestCommonSubstring(first, second)
	fmt.Printf("LCS of '%s' and '%s': '%s'\n", first, second, common)
}
Levenshtein Distance
package main
import (
"fmt"
)
// levenshteinDistance returns the minimum number of single-character
// insertions, deletions and substitutions needed to turn s1 into s2.
//
// Characters are runes, not bytes, so replacing "é" with "e" costs one edit.
// Bytes that are not valid UTF-8 are treated as U+FFFD by the rune
// conversion.
func levenshteinDistance(s1, s2 string) int {
	a, b := []rune(s1), []rune(s2)

	// prev is the DP row for the first i-1 characters of a, cur the row being
	// filled for the first i characters. Each cell only depends on the cell
	// to its left and on the previous row, so two rows are sufficient.
	prev := make([]int, len(b)+1)
	cur := make([]int, len(b)+1)

	// Turning the empty prefix of a into b[:j] takes j insertions.
	for j := range prev {
		prev[j] = j
	}

	for i := 1; i <= len(a); i++ {
		// Turning a[:i] into the empty string takes i deletions.
		cur[0] = i
		for j := 1; j <= len(b); j++ {
			if a[i-1] == b[j-1] {
				cur[j] = prev[j-1]
				continue
			}
			// Cheapest of substitution, deletion and insertion, plus one.
			best := prev[j-1]
			if prev[j] < best {
				best = prev[j]
			}
			if cur[j-1] < best {
				best = cur[j-1]
			}
			cur[j] = best + 1
		}
		prev, cur = cur, prev
	}

	// After the final swap prev holds the last completed row.
	return prev[len(b)]
}
// min returns the smallest of its three integer arguments.
func min(a, b, c int) int {
	smallest := c
	if b < smallest {
		smallest = b
	}
	if a < smallest {
		smallest = a
	}
	return smallest
}
// main prints the edit distance between two sample words.
func main() {
	const from, to = "kitten", "sitting"
	fmt.Printf("Distance between '%s' and '%s': %d\n", from, to, levenshteinDistance(from, to))
}
Practical Examples
Text Summarization
package main
import (
"fmt"
"strings"
)
// summarize returns the first numSentences sentences of text.
//
// Sentences are delimited by ". " (a period followed by a space). When text
// contains numSentences sentences or fewer it is returned unchanged;
// otherwise the leading sentences are re-joined and terminated with a
// period. A numSentences of zero or less yields an empty summary instead of
// a lone "." or an out-of-range slice panic.
func summarize(text string, numSentences int) string {
	if numSentences <= 0 {
		return ""
	}

	sentences := strings.Split(text, ". ")
	if len(sentences) <= numSentences {
		return text
	}

	// Splitting removed the period after every sentence but the last, so the
	// truncated summary needs its final period put back.
	summary := strings.Join(sentences[:numSentences], ". ")
	if !strings.HasSuffix(summary, ".") {
		summary += "."
	}
	return summary
}
// main prints a sample paragraph followed by its two-sentence summary.
func main() {
	const article = "Go is a programming language. It is fast and efficient. Go is used for many applications. It has great concurrency support."

	fmt.Println("Original:")
	fmt.Println(article)

	short := summarize(article, 2)
	fmt.Println("\nSummary (2 sentences):")
	fmt.Println(short)
}
Text Tokenization
package main
import (
"fmt"
"regexp"
"strings"
)
// punctuationRE matches every character that is neither a Unicode letter,
// combining mark or number, nor an underscore, nor whitespace. It is compiled
// once at package scope instead of on every tokenize call. Unicode classes
// are used because \w only covers ASCII in Go's regexp syntax and would
// strip accented letters as if they were punctuation.
var punctuationRE = regexp.MustCompile(`[^\p{L}\p{M}\p{N}_\s]`)

// tokenize splits text into lower-cased words.
//
// Punctuation and other symbols are removed first, the remainder is split on
// whitespace, and every resulting word is converted to lower case. The
// returned slice is empty when text contains no words.
func tokenize(text string) []string {
	stripped := punctuationRE.ReplaceAllString(text, "")

	words := strings.Fields(stripped)
	for i, word := range words {
		words[i] = strings.ToLower(word)
	}
	return words
}
// main tokenizes a sample sentence and prints the resulting tokens.
func main() {
	const sentence = "Hello, World! How are you?"
	fmt.Println("Tokens:", tokenize(sentence))
}
Anagram Checker
package main
import (
"fmt"
"sort"
"strings"
)
// sortString returns the characters of s rearranged in ascending code point
// order.
func sortString(s string) string {
	runes := []rune(s)
	sort.Slice(runes, func(i, j int) bool { return runes[i] < runes[j] })

	var sorted strings.Builder
	sorted.Grow(len(s))
	for _, r := range runes {
		sorted.WriteRune(r)
	}
	return sorted.String()
}
// areAnagrams reports whether s1 and s2 consist of the same characters once
// spaces are removed and case is ignored.
func areAnagrams(s1, s2 string) bool {
	// canonical reduces a phrase to its sorted, space-free, lower-case form;
	// two phrases are anagrams exactly when those forms are equal.
	canonical := func(phrase string) string {
		compact := strings.ReplaceAll(phrase, " ", "")
		return sortString(strings.ToLower(compact))
	}
	return canonical(s1) == canonical(s2)
}
// main checks a few sample pairs of phrases and prints whether each pair is
// an anagram.
func main() {
	type pair struct{ first, second string }

	pairs := []pair{
		{first: "listen", second: "silent"},
		{first: "hello", second: "world"},
		{first: "The Eyes", second: "They See"},
	}
	for _, p := range pairs {
		fmt.Printf("'%s' and '%s' are anagrams: %v\n", p.first, p.second, areAnagrams(p.first, p.second))
	}
}
Best Practices
✅ Good Practices
// Illustrative fragments only: they assume a string variable named text is
// already declared and are not a complete program.

// Use strings.Builder for efficient concatenation: pieces are appended to a
// single buffer instead of creating a new string on every step.
var builder strings.Builder
builder.WriteString("text")
// Use strings package functions for plain substring search and splitting.
strings.Contains(text, "search")
strings.Split(text, ",")
// Normalize text before processing: lower-case it and strip leading and
// trailing whitespace.
text = strings.ToLower(text)
text = strings.TrimSpace(text)
// Use regex for complex patterns. MustCompile panics on an invalid pattern;
// the -1 passed to FindAllString asks for all matches.
re := regexp.MustCompile(`pattern`)
matches := re.FindAllString(text, -1)
❌ Anti-Patterns
// Illustrative fragments of what NOT to do; they are intentionally
// incomplete and are not meant to compile.

// Don't use string concatenation in loops: every += builds a new string.
// NOTE: the slice being ranged over is named strings here, which in real
// code would shadow the strings package.
result := ""
for _, s := range strings {
result += s // Inefficient: prefer strings.Builder
}
// Don't ignore case sensitivity: == is an exact, case-sensitive comparison.
if text == "Hello" // May miss variations such as "hello" or "HELLO"
// Don't use regex for simple operations
// Use strings package instead
// Don't process untrusted input without validation
// Always validate and sanitize
Resources
- Go strings Package Documentation
- Go regexp Package Documentation
- String Algorithms
- Text Processing Best Practices
Summary
Text processing in Go involves:
- Efficient string manipulation with the `strings` package
- Building strings with `strings.Builder`
- Implementing text algorithms (palindromes, distances, etc.)
- Using regex for pattern matching
- Normalizing and validating text
- Optimizing for performance
With these techniques, you can efficiently process and analyze text in Go applications.
Comments