Text Processing and String Algorithms in Go

Text processing is a fundamental skill in programming. Go provides excellent tools for working with strings and implementing text algorithms. This guide covers practical techniques and algorithms for text processing.

String Manipulation

Basic String Operations

package main

import (
	"fmt"
	"strings"
)

// main demonstrates the everyday helpers from the strings package on a
// single sample sentence, printing one labelled result per operation.
func main() {
	greeting := "Hello, World!"

	// len reports the size in bytes, which equals the character count only
	// for ASCII text such as this sample.
	fmt.Println("Length:", len(greeting))

	// Slicing is byte-indexed as well; [:5] selects the first five bytes.
	fmt.Println("Substring:", greeting[:5])

	// Substring search: presence first, then the byte offset of the match.
	const needle = "World"
	fmt.Println("Contains 'World':", strings.Contains(greeting, needle))
	fmt.Println("Index of 'World':", strings.Index(greeting, needle))

	// The final argument limits how many occurrences are replaced.
	replaced := strings.Replace(greeting, needle, "Go", 1)
	fmt.Println("Replace:", replaced)

	// Split on a separator, then glue the pieces back with another one.
	pieces := strings.Split(greeting, ", ")
	fmt.Println("Split:", pieces)
	fmt.Println("Join:", strings.Join(pieces, " - "))

	// Trim strips every leading and trailing character found in the cutset.
	fmt.Println("Trim:", strings.Trim(greeting, "!"))

	// Case conversion.
	upper, lower := strings.ToUpper(greeting), strings.ToLower(greeting)
	fmt.Println("Upper:", upper)
	fmt.Println("Lower:", lower)
}

String Building

package main

import (
	"fmt"
	"strings"
)

// main shows three ways of assembling a string: strings.Builder, the +
// operator, and fmt.Sprintf.
func main() {
	// strings.Builder appends into one growing buffer, which makes it the
	// right tool when many pieces are added, especially inside loops.
	var sb strings.Builder
	for _, piece := range []string{"Hello", " ", "World"} {
		sb.WriteString(piece)
	}
	fmt.Println(sb.String())

	// The + operator is fine for a fixed handful of pieces; it becomes
	// costly when used repeatedly to grow a string in a loop.
	message := "Hello" + " " + "World"
	fmt.Println(message)

	// fmt.Sprintf formats values of mixed types into a string.
	name, age := "Alice", 30
	message = fmt.Sprintf("Name: %s, Age: %d", name, age)
	fmt.Println(message)
}

Text Algorithms

Word Frequency Counter

package main

import (
	"fmt"
	"strings"
)

// countWords returns how many times each word occurs in text.
//
// The text is split on whitespace. Each token is lower-cased and has any
// leading or trailing '.', ',', '!', '?', ';' or ':' removed, so "Dog." and
// "dog" are counted as the same word. Tokens that consist solely of that
// punctuation (for example "..." or "!?") are ignored rather than being
// counted under the empty-string key.
//
// The returned map is never nil; it is empty when text contains no words.
func countWords(text string) map[string]int {
	frequency := make(map[string]int)

	for _, word := range strings.Fields(text) {
		// Normalize: case-fold first, then strip surrounding punctuation.
		word = strings.Trim(strings.ToLower(word), ".,!?;:")
		if word == "" {
			// Nothing left after trimming: the token was pure punctuation.
			continue
		}

		frequency[word]++
	}

	return frequency
}

// main counts the words of a sample sentence and prints one "word: count"
// line per distinct word. Map iteration order is unspecified in Go, so the
// lines may appear in a different order on each run.
func main() {
	counts := countWords("The quick brown fox jumps over the lazy dog. The dog was lazy.")

	for w, n := range counts {
		fmt.Printf("%s: %d\n", w, n)
	}
}

Palindrome Checker

package main

import (
	"fmt"
	"strings"
	"unicode"
)

// isPalindrome reports whether text reads the same forwards and backwards
// when only letters and digits are considered and letter case is ignored.
//
// Spaces, punctuation and every other non-alphanumeric character are
// skipped. The comparison is done rune by rune, not byte by byte, so text
// containing multi-byte UTF-8 characters (accented or non-Latin letters) is
// handled correctly. Text with no letters or digits, including the empty
// string, is considered a palindrome.
func isPalindrome(text string) bool {
	// Keep only letters and digits, lower-cased. strings.Map drops any rune
	// for which the mapping function returns a negative value.
	cleaned := []rune(strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return unicode.ToLower(r)
		}
		return -1
	}, text))

	// Walk inwards from both ends comparing whole runes.
	for i, j := 0, len(cleaned)-1; i < j; i, j = i+1, j-1 {
		if cleaned[i] != cleaned[j] {
			return false
		}
	}

	return true
}

// main runs isPalindrome over a few sample phrases and prints the verdict
// for each one.
func main() {
	samples := []string{
		"A man, a plan, a canal: Panama",
		"race a car",
		"hello",
	}

	for i := range samples {
		phrase := samples[i]
		verdict := isPalindrome(phrase)
		fmt.Printf("'%s' is palindrome: %v\n", phrase, verdict)
	}
}

Longest Common Substring

package main

import (
	"fmt"
)

// longestCommonSubstring returns the longest run of consecutive characters
// that appears in both s1 and s2, or "" when they share none or when
// either input is empty.
//
// When several common substrings have the same maximal length, the one
// that ends earliest in s1 is returned.
//
// The inputs are compared rune by rune, so the result never splits a
// multi-byte UTF-8 character. As a consequence of the rune conversion,
// bytes that are not valid UTF-8 are treated as U+FFFD.
//
// Time is O(len(s1)*len(s2)); only two rows of the dynamic-programming
// table are kept, so extra memory is O(len(s2)).
func longestCommonSubstring(s1, s2 string) string {
	a, b := []rune(s1), []rune(s2)
	if len(a) == 0 || len(b) == 0 {
		return ""
	}

	// prev[j] / cur[j] hold the length of the common suffix of a[:i-1] /
	// a[:i] and b[:j]. Index 0 is never written and stays 0 as the border.
	prev := make([]int, len(b)+1)
	cur := make([]int, len(b)+1)

	maxLen := 0 // length of the best match found so far
	endPos := 0 // exclusive end index of that match within a

	for i := 1; i <= len(a); i++ {
		for j := 1; j <= len(b); j++ {
			if a[i-1] != b[j-1] {
				// A mismatch breaks the run; the row is reused, so the
				// stale value must be cleared explicitly.
				cur[j] = 0
				continue
			}
			cur[j] = prev[j-1] + 1
			// Strict > keeps the first (earliest-ending) maximal match.
			if cur[j] > maxLen {
				maxLen = cur[j]
				endPos = i
			}
		}
		prev, cur = cur, prev
	}

	return string(a[endPos-maxLen : endPos])
}

// main prints the longest common substring of two sample strings.
func main() {
	first, second := "abcdef", "fbdamn"
	common := longestCommonSubstring(first, second)

	fmt.Printf("LCS of '%s' and '%s': '%s'\n", first, second, common)
}

Levenshtein Distance

package main

import (
	"fmt"
)

// levenshteinDistance returns the minimum number of single-character
// insertions, deletions and substitutions needed to turn s1 into s2.
//
// The strings are compared rune by rune, so replacing one multi-byte UTF-8
// character with another counts as a single edit. Bytes that are not valid
// UTF-8 are treated as U+FFFD by the rune conversion.
//
// Time is O(len(s1)*len(s2)); only two rows of the dynamic-programming
// table are kept, so extra memory is O(len(s2)).
func levenshteinDistance(s1, s2 string) int {
	a, b := []rune(s1), []rune(s2)

	// prev[j] is the distance between a[:i-1] and b[:j]; cur[j] is the
	// distance between a[:i] and b[:j].
	prev := make([]int, len(b)+1)
	cur := make([]int, len(b)+1)

	// Turning the empty prefix into b[:j] takes j insertions.
	for j := range prev {
		prev[j] = j
	}

	for i := 1; i <= len(a); i++ {
		// Turning a[:i] into the empty prefix takes i deletions.
		cur[0] = i
		for j := 1; j <= len(b); j++ {
			if a[i-1] == b[j-1] {
				// Matching characters cost nothing extra.
				cur[j] = prev[j-1]
			} else {
				// One edit on top of the cheapest of: deletion (prev[j]),
				// insertion (cur[j-1]) or substitution (prev[j-1]).
				cur[j] = 1 + min(prev[j], cur[j-1], prev[j-1])
			}
		}
		prev, cur = cur, prev
	}

	// After the final swap prev holds the last completed row.
	return prev[len(b)]
}

// min returns the smallest of a, b and c.
//
// This package-level helper takes precedence over the built-in min that
// exists since Go 1.21; on such versions the built-in gives the same result
// for three ints.
func min(a, b, c int) int {
	if a < b && a < c {
		return a
	}
	if b < c {
		return b
	}
	return c
}

// main prints the edit distance between two sample words.
func main() {
	from, to := "kitten", "sitting"

	fmt.Printf("Distance between '%s' and '%s': %d\n", from, to, levenshteinDistance(from, to))
}

Practical Examples

Text Summarization

package main

import (
	"fmt"
	"strings"
)

// summarize returns the first numSentences sentences of text.
//
// Sentences are delimited naively by the two-character sequence ". ", so
// abbreviations such as "e.g. " also count as sentence boundaries. If text
// contains numSentences sentences or fewer it is returned unchanged. If
// numSentences is zero or negative the summary is empty.
//
// The returned summary always ends with a period.
func summarize(text string, numSentences int) string {
	// A non-positive count asks for no sentences; it would otherwise yield
	// a lone "." (zero) or panic on the slice expression below (negative).
	if numSentences <= 0 {
		return ""
	}

	// Split into sentences.
	sentences := strings.Split(text, ". ")

	if len(sentences) <= numSentences {
		return text
	}

	// Take the first N sentences. Splitting consumed the period of every
	// sentence but the last, so the final one kept needs it restored.
	summary := strings.Join(sentences[:numSentences], ". ")
	if !strings.HasSuffix(summary, ".") {
		summary += "."
	}

	return summary
}

// main prints a sample paragraph followed by its two-sentence summary.
func main() {
	paragraph := "Go is a programming language. It is fast and efficient. Go is used for many applications. It has great concurrency support."
	short := summarize(paragraph, 2)

	fmt.Println("Original:")
	fmt.Println(paragraph)
	fmt.Println("\nSummary (2 sentences):")
	fmt.Println(short)
}

Text Tokenization

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// nonWordChars matches every character that is not a letter, combining
// mark, number, underscore or whitespace. Unicode classes are used because
// \w in Go's regexp syntax is ASCII-only and would classify letters such as
// "é" as punctuation. The pattern is compiled once at package level instead
// of on every tokenize call.
var nonWordChars = regexp.MustCompile(`[^\p{L}\p{M}\p{N}_\s]`)

// tokenize splits text into lower-cased word tokens.
//
// Punctuation and symbols are deleted first (so "don't" becomes "dont" and
// "well-known" becomes "wellknown"), then the remainder is split on
// whitespace and each word is lower-cased. Letters and digits from any
// script are preserved. The result has length 0 when text contains no
// words.
func tokenize(text string) []string {
	// Remove punctuation.
	text = nonWordChars.ReplaceAllString(text, "")

	// Split into words.
	words := strings.Fields(text)

	// Convert to lowercase in place.
	for i, word := range words {
		words[i] = strings.ToLower(word)
	}

	return words
}

// main tokenizes a sample sentence and prints the resulting tokens.
func main() {
	words := tokenize("Hello, World! How are you?")

	fmt.Println("Tokens:", words)
}

Anagram Checker

package main

import (
	"fmt"
	"sort"
	"strings"
)

// sortString returns the characters of s rearranged in ascending order.
// Two strings made of the same characters therefore map to the same
// result, which is what makes it usable as an anagram key.
func sortString(s string) string {
	// Splitting on the empty separator yields one element per UTF-8
	// character.
	pieces := strings.Split(s, "")
	sort.Sort(sort.StringSlice(pieces))

	var out strings.Builder
	out.Grow(len(s))
	for _, p := range pieces {
		out.WriteString(p)
	}
	return out.String()
}

// areAnagrams reports whether s1 and s2 consist of exactly the same
// characters, ignoring letter case and space characters. Other whitespace
// and punctuation are significant.
func areAnagrams(s1, s2 string) bool {
	// canonical reduces a phrase to a key that is equal for all anagrams:
	// spaces removed, lower-cased, characters sorted.
	canonical := func(phrase string) string {
		withoutSpaces := strings.ReplaceAll(phrase, " ", "")
		return sortString(strings.ToLower(withoutSpaces))
	}

	return canonical(s1) == canonical(s2)
}

// main checks a few sample pairs with areAnagrams and prints the verdict
// for each pair.
func main() {
	type pair struct {
		s1, s2 string
	}

	pairs := []pair{
		{"listen", "silent"},
		{"hello", "world"},
		{"The Eyes", "They See"},
	}

	for i := range pairs {
		p := pairs[i]
		verdict := areAnagrams(p.s1, p.s2)
		fmt.Printf("'%s' and '%s' are anagrams: %v\n", p.s1, p.s2, verdict)
	}
}

Best Practices

✅ Good Practices

// Use strings.Builder for efficient concatenation: it appends into a single
// growing buffer instead of creating a new string for every piece.
var builder strings.Builder
builder.WriteString("text")

// Prefer the ready-made strings package functions for searching and
// splitting over hand-written loops.
strings.Contains(text, "search")
strings.Split(text, ",")

// Normalize text before processing: lower-case it and strip leading and
// trailing whitespace so that later comparisons are consistent.
text = strings.ToLower(text)
text = strings.TrimSpace(text)

// Use regex for complex patterns. MustCompile panics on an invalid pattern;
// passing -1 to FindAllString returns every match rather than a limited
// number.
re := regexp.MustCompile(`pattern`)
matches := re.FindAllString(text, -1)

❌ Anti-Patterns

// Don't use string concatenation in loops: strings are immutable, so each +=
// builds a new string and copies everything accumulated so far. Use
// strings.Builder instead. (In this fragment `strings` is the name of the
// collection being ranged over, not the standard-library package.)
result := ""
for _, s := range strings {
	result += s // Inefficient: re-copies the whole result on every iteration
}

// Don't ignore case sensitivity: == is an exact, case-sensitive comparison.
// strings.EqualFold compares case-insensitively. (The line below is an
// illustrative fragment, not a complete statement.)
if text == "Hello" // May miss variations such as "hello" or "HELLO"

// Don't use regex for simple operations
// Use strings package instead

// Don't process untrusted input without validation
// Always validate and sanitize

Resources

Summary

Text processing in Go involves:

- Efficient string manipulation with the strings package
- Building strings with strings.Builder
- Implementing text algorithms (palindromes, edit distances, etc.)
- Using regex for pattern matching
- Normalizing and validating text
- Optimizing for performance

With these techniques, you can efficiently process and analyze text in Go applications.