File System Operations at Scale in Go
Introduction
Handling large-scale file system operations requires careful optimization. This guide covers efficient file handling, directory traversal, and batch operations.
Proper file system operations at scale prevent resource exhaustion and maintain performance.
Efficient File Operations
Buffered I/O
package main
import (
"bufio"
"fmt"
"os"
)
// ProcessLargeFile streams filename line by line, invoking processor on
// each line of text. The scanner's buffer is enlarged so lines of up to
// 1 MiB are handled without error. Processing stops at the first error
// returned by processor; any scanner error is reported after a clean pass.
func ProcessLargeFile(filename string, processor func(string) error) error {
	f, err := os.Open(filename)
	if err != nil {
		return err
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	// The default per-token limit is 64 KiB; raise it to 1 MiB for long lines.
	sc.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	for sc.Scan() {
		if perr := processor(sc.Text()); perr != nil {
			return perr
		}
	}
	return sc.Err()
}
// WriteToFile writes each entry of data to filename as its own line,
// creating (or truncating) the file. All writes go through a buffered
// writer so we do not pay one syscall per line.
func WriteToFile(filename string, data []string) error {
	file, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer file.Close()

	writer := bufio.NewWriter(file)
	for _, line := range data {
		if _, err := writer.WriteString(line + "\n"); err != nil {
			return err
		}
	}
	// Flush explicitly and return its error. The original deferred the
	// Flush, which silently discarded buffered-write failures (e.g. a
	// full disk) and reported success to the caller.
	return writer.Flush()
}
Good: Proper Large-Scale File Operations
package main
import (
"fmt"
"io/fs"
"os"
"path/filepath"
"sync"
)
// DirectoryWalker walks directory trees, running a bounded number of
// per-file processors concurrently.
type DirectoryWalker struct {
	workers   int
	semaphore chan struct{} // counting semaphore bounding in-flight goroutines
}

// NewDirectoryWalker creates a walker that processes at most workers
// files concurrently. A workers value below 1 is clamped to 1: a
// zero-capacity semaphore would deadlock on the first send.
func NewDirectoryWalker(workers int) *DirectoryWalker {
	if workers < 1 {
		workers = 1
	}
	return &DirectoryWalker{
		workers:   workers,
		semaphore: make(chan struct{}, workers),
	}
}

// Walk traverses the tree rooted at root and invokes processor for every
// non-directory entry. Processors run concurrently, bounded by the
// walker's worker count, and Walk blocks until all of them finish.
// A traversal error aborts the walk and is returned; otherwise the first
// error observed from d.Info or a processor (if any) is returned.
func (dw *DirectoryWalker) Walk(root string, processor func(string, fs.FileInfo) error) error {
	var (
		wg       sync.WaitGroup
		mu       sync.Mutex
		firstErr error
	)
	// record keeps the first non-nil error seen by any goroutine.
	record := func(err error) {
		if err == nil {
			return
		}
		mu.Lock()
		if firstErr == nil {
			firstErr = err
		}
		mu.Unlock()
	}
	walkErr := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if d.IsDir() {
			return nil
		}
		wg.Add(1)
		dw.semaphore <- struct{}{} // blocks once the worker limit is reached
		go func(p string, d fs.DirEntry) {
			defer wg.Done()
			defer func() { <-dw.semaphore }()
			info, err := d.Info()
			if err != nil {
				// BUG FIX: the original ignored this error and passed a
				// possibly-nil info to processor.
				record(err)
				return
			}
			record(processor(p, info))
		}(path, d)
		return nil
	})
	// BUG FIX: the original returned without waiting, so callers could
	// observe Walk as "done" while processors were still running.
	wg.Wait()
	if walkErr != nil {
		return walkErr
	}
	return firstErr
}
// BatchFileProcessor groups file paths into fixed-size batches and hands
// each batch to a user-supplied callback.
type BatchFileProcessor struct {
	batchSize int
	processor func([]string) error
}

// NewBatchFileProcessor returns a processor that delivers batches of at
// most batchSize paths to the given callback.
func NewBatchFileProcessor(batchSize int, processor func([]string) error) *BatchFileProcessor {
	return &BatchFileProcessor{batchSize: batchSize, processor: processor}
}

// ProcessDirectory lists dir (non-recursively), skips subdirectories, and
// feeds the remaining entries' full paths to the processor in batches of
// batchSize. A trailing partial batch is delivered last. The first error
// from ReadDir or the processor aborts processing and is returned.
func (bfp *BatchFileProcessor) ProcessDirectory(dir string) error {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return err
	}
	pending := make([]string, 0, bfp.batchSize)
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		pending = append(pending, filepath.Join(dir, e.Name()))
		if len(pending) < bfp.batchSize {
			continue
		}
		if err := bfp.processor(pending); err != nil {
			return err
		}
		pending = pending[:0] // reuse the backing array for the next batch
	}
	if len(pending) == 0 {
		return nil
	}
	return bfp.processor(pending)
}
// FileSystemCache is a concurrency-safe map from path to fs.FileInfo,
// letting callers avoid repeated stat calls for the same path.
type FileSystemCache struct {
	cache map[string]fs.FileInfo
	mu    sync.RWMutex
}

// NewFileSystemCache returns an empty, ready-to-use cache.
func NewFileSystemCache() *FileSystemCache {
	return &FileSystemCache{cache: make(map[string]fs.FileInfo)}
}

// Get reports the cached info for path and whether an entry exists.
func (c *FileSystemCache) Get(path string) (fs.FileInfo, bool) {
	c.mu.RLock()
	info, ok := c.cache[path]
	c.mu.RUnlock()
	return info, ok
}

// Set stores info under path, replacing any previous entry.
func (c *FileSystemCache) Set(path string, info fs.FileInfo) {
	c.mu.Lock()
	c.cache[path] = info
	c.mu.Unlock()
}

// Clear discards every cached entry.
func (c *FileSystemCache) Clear() {
	c.mu.Lock()
	c.cache = make(map[string]fs.FileInfo)
	c.mu.Unlock()
}
// DirectorySize walks the tree rooted at dir and returns the summed size
// in bytes of every non-directory entry. The first traversal or stat
// error aborts the walk and is returned along with the partial sum.
func DirectorySize(dir string) (int64, error) {
	var size int64
	err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if d.IsDir() {
			return nil
		}
		info, err := d.Info()
		if err != nil {
			// BUG FIX: the original ignored this error and would panic on a
			// nil info (e.g. if the file was removed mid-walk).
			return err
		}
		size += info.Size()
		return nil
	})
	return size, err
}
// FindFiles walks the tree rooted at dir and returns the paths of every
// non-directory entry for which predicate reports true. The first
// traversal error aborts the walk and is returned alongside whatever
// matches were collected up to that point.
func FindFiles(dir string, predicate func(fs.DirEntry) bool) ([]string, error) {
	var matches []string
	walkFn := func(path string, entry fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if entry.IsDir() {
			return nil
		}
		if predicate(entry) {
			matches = append(matches, path)
		}
		return nil
	}
	err := filepath.WalkDir(dir, walkFn)
	return matches, err
}
Bad: Improper Large-Scale Operations
package main
// BAD: Loading entire directory into memory
// BadLoadDirectory reads every entry of dir into a single slice before any
// processing begins, and discards the ReadDir error entirely. For very
// large directories this holds all entries in memory at once; compare the
// batched approach shown above.
// NOTE(review): this illustrative snippet does not compile as-is — `entry`
// is declared and not used, and this package block has no imports.
func BadLoadDirectory(dir string) {
	entries, _ := os.ReadDir(dir)
	// Processes all at once
	for _, entry := range entries {
		// Process
	}
}
// BAD: No concurrency
// BadSequentialProcessing visits the files strictly one at a time even
// though each iteration is independent; a bounded pool of goroutines (as in
// DirectoryWalker above) would overlap the per-file work.
// NOTE(review): `file` is declared and not used, so this illustrative
// snippet does not compile as-is.
func BadSequentialProcessing(files []string) {
	for _, file := range files {
		// Process one at a time
	}
}
// BAD: No caching
// BadNoCaching stats the same path three times in a row; each call is a
// separate syscall that returns the same information. A cache keyed by path
// (as in FileSystemCache above) avoids the repeated work.
// NOTE(review): both return values of os.Stat — the info and the error —
// are discarded here.
func BadNoCaching(path string) {
	// Stat called repeatedly
	os.Stat(path)
	os.Stat(path)
	os.Stat(path)
}
Problems:
- No concurrency
- No caching
- No batching
- Memory inefficient
Best Practices
1. Use Buffered I/O
scanner := bufio.NewScanner(file)
scanner.Buffer(buf, maxSize)
2. Implement Concurrency
walker := NewDirectoryWalker(4)
walker.Walk(dir, processor)
3. Cache Results
cache := NewFileSystemCache()
info, exists := cache.Get(path)
4. Process in Batches
processor := NewBatchFileProcessor(100, handler)
processor.ProcessDirectory(dir)
Common Pitfalls
1. No Concurrency
Use goroutines for parallel processing.
2. No Caching
Cache file information.
3. No Batching
Process in batches.
4. Memory Issues
Stream data when possible.
Resources: the Go standard library documentation for the bufio, io/fs, os, and path/filepath packages covers every API used in this guide.
Summary
Large-scale file operations require optimization. Key takeaways:
- Use buffered I/O
- Implement concurrency
- Cache file information
- Process in batches
- Monitor resource usage
- Handle errors properly
- Test with large datasets
By mastering large-scale file operations, you can handle massive file systems efficiently.
Comments