Skip to main content
⚡ Calmops

File System Operations at Scale in Go

File System Operations at Scale in Go

Introduction

Handling large-scale file system operations requires careful optimization. This guide covers efficient file handling, directory traversal, and batch operations.

Proper file system operations at scale prevent resource exhaustion and maintain performance.

Efficient File Operations

Buffered I/O

package main

import (
	"bufio"
	"fmt"
	"os"
)

// ProcessLargeFile processes a large file efficiently
func ProcessLargeFile(filename string, processor func(string) error) error {
	file, err := os.Open(filename)
	if err != nil {
		return err
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	// Increase buffer size for large lines
	buf := make([]byte, 0, 64*1024)
	scanner.Buffer(buf, 1024*1024)

	for scanner.Scan() {
		if err := processor(scanner.Text()); err != nil {
			return err
		}
	}

	return scanner.Err()
}

// WriteToFile writes data efficiently
func WriteToFile(filename string, data []string) error {
	file, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer file.Close()

	writer := bufio.NewWriter(file)
	defer writer.Flush()

	for _, line := range data {
		if _, err := writer.WriteString(line + "\n"); err != nil {
			return err
		}
	}

	return nil
}

Good: Proper Large-Scale File Operations

package main

import (
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"sync"
)

// DirectoryWalker walks directories efficiently
type DirectoryWalker struct {
	workers int
	semaphore chan struct{}
}

// NewDirectoryWalker creates a new directory walker
func NewDirectoryWalker(workers int) *DirectoryWalker {
	return &DirectoryWalker{
		workers:   workers,
		semaphore: make(chan struct{}, workers),
	}
}

// Walk walks a directory tree
func (dw *DirectoryWalker) Walk(root string, processor func(string, fs.FileInfo) error) error {
	var wg sync.WaitGroup

	return filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		if d.IsDir() {
			return nil
		}

		wg.Add(1)
		dw.semaphore <- struct{}{}

		go func(p string) {
			defer wg.Done()
			defer func() { <-dw.semaphore }()

			info, _ := d.Info()
			processor(p, info)
		}(path)

		return nil
	})
}

// BatchFileProcessor processes files in batches
type BatchFileProcessor struct {
	batchSize int
	processor func([]string) error
}

// NewBatchFileProcessor creates a new batch processor
func NewBatchFileProcessor(batchSize int, processor func([]string) error) *BatchFileProcessor {
	return &BatchFileProcessor{
		batchSize: batchSize,
		processor: processor,
	}
}

// ProcessDirectory processes all files in a directory
func (bfp *BatchFileProcessor) ProcessDirectory(dir string) error {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return err
	}

	var batch []string

	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}

		batch = append(batch, filepath.Join(dir, entry.Name()))

		if len(batch) >= bfp.batchSize {
			if err := bfp.processor(batch); err != nil {
				return err
			}
			batch = batch[:0]
		}
	}

	if len(batch) > 0 {
		return bfp.processor(batch)
	}

	return nil
}

// FileSystemCache caches file information
type FileSystemCache struct {
	cache map[string]fs.FileInfo
	mu    sync.RWMutex
}

// NewFileSystemCache creates a new cache
func NewFileSystemCache() *FileSystemCache {
	return &FileSystemCache{
		cache: make(map[string]fs.FileInfo),
	}
}

// Get gets cached file info
func (fsc *FileSystemCache) Get(path string) (fs.FileInfo, bool) {
	fsc.mu.RLock()
	defer fsc.mu.RUnlock()

	info, exists := fsc.cache[path]
	return info, exists
}

// Set sets cached file info
func (fsc *FileSystemCache) Set(path string, info fs.FileInfo) {
	fsc.mu.Lock()
	defer fsc.mu.Unlock()

	fsc.cache[path] = info
}

// Clear clears the cache
func (fsc *FileSystemCache) Clear() {
	fsc.mu.Lock()
	defer fsc.mu.Unlock()

	fsc.cache = make(map[string]fs.FileInfo)
}

// DirectorySize calculates directory size
func DirectorySize(dir string) (int64, error) {
	var size int64

	err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		if !d.IsDir() {
			info, _ := d.Info()
			size += info.Size()
		}

		return nil
	})

	return size, err
}

// FindFiles finds files matching criteria
func FindFiles(dir string, predicate func(fs.DirEntry) bool) ([]string, error) {
	var files []string

	err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		if !d.IsDir() && predicate(d) {
			files = append(files, path)
		}

		return nil
	})

	return files, err
}

Bad: Improper Large-Scale Operations

package main

// BAD: Loading entire directory into memory
// os.ReadDir materializes every entry in one slice before any work
// starts, and the error is discarded with `_`, so failures go unnoticed.
// NOTE(review): the unused loop variable would not compile as-is; this
// snippet is illustrative only.
func BadLoadDirectory(dir string) {
	entries, _ := os.ReadDir(dir)
	// Processes all at once
	for _, entry := range entries {
		// Process
	}
}

// BAD: No concurrency
// Files are handled strictly one after another, so independent work —
// especially I/O wait — is needlessly serialized.
// NOTE(review): the unused loop variable would not compile as-is; this
// snippet is illustrative only.
func BadSequentialProcessing(files []string) {
	for _, file := range files {
		// Process one at a time
	}
}

// BAD: No caching
// Every os.Stat goes back to the file system (and every return value,
// including the error, is discarded). Repeated lookups of the same path
// should be served from a cache such as FileSystemCache above.
func BadNoCaching(path string) {
	// Stat called repeatedly
	os.Stat(path)
	os.Stat(path)
	os.Stat(path)
}

Problems:

  • No concurrency
  • No caching
  • No batching
  • Memory inefficient

Best Practices

1. Use Buffered I/O

scanner := bufio.NewScanner(file)
scanner.Buffer(buf, maxSize)

2. Implement Concurrency

walker := NewDirectoryWalker(4)
walker.Walk(dir, processor)

3. Cache Results

cache := NewFileSystemCache()
info, exists := cache.Get(path)

4. Process in Batches

processor := NewBatchFileProcessor(100, handler)
processor.ProcessDirectory(dir)

Common Pitfalls

1. No Concurrency

Use goroutines for parallel processing.

2. No Caching

Cache file information.

3. No Batching

Process in batches.

4. Memory Issues

Stream data when possible.

Resources

Summary

Large-scale file operations require optimization. Key takeaways:

  • Use buffered I/O
  • Implement concurrency
  • Cache file information
  • Process in batches
  • Monitor resource usage
  • Handle errors properly
  • Test with large datasets

By mastering large-scale file operations, you can handle massive file systems efficiently.

Comments