10 Essential File System Operations Every Developer Should Know

Working with files and directories is a fundamental part of Python development, yet many developers stick to basic operations without exploring the full power of Python’s path handling capabilities. Whether you’re building web applications, data processing pipelines, or automation scripts, mastering these essential file system patterns will make your code more robust, efficient, and maintainable.
1. Smart File Discovery with Glob Patterns
You often need to find files based on a pattern, not just an exact name. The Path object’s glob() method is a powerful and intuitive tool for this.
Basic & Recursive Search
Let’s say you have a project in a src/ folder. Here’s how to find all your Python files.
from pathlib import Path
# The Path object is your main tool for filesystem operations.
project_dir = Path("src/")
# 1. Find all Python files in the top-level of the 'src' directory.
# The asterisk (*) is a wildcard for "match anything".
print("--- Top-level .py files ---")
for f in project_dir.glob("*.py"):
    print(f)
# 2. Find all Python files RECURSIVELY through all subdirectories.
# The 'rglob' method is your best friend for deep searches.
print("\n--- All .py files in the project ---")
for f in project_dir.rglob("*.py"):
    print(f)
# Example Output:
# --- Top-level .py files ---
# src/main.py
# --- All .py files in the project ---
# src/main.py
# src/utils/helpers.py
# src/api/models.py
Advanced Pattern Matching
glob supports more than just *. You can use ? to match any single character and [] to match a set or range of characters, just like in the shell.
from pathlib import Path
# Set up a 'logs' directory with example files for this demo.
logs_dir = Path("logs/")
logs_dir.mkdir(exist_ok=True)
Path("logs/app1.log").touch()
Path("logs/app2.log").touch()
Path("logs/app_extra.log").touch()
Path("logs/2023-10-01.log").touch()
Path("logs/2023-11-01.log").touch()
# Find logs like 'app1.log', 'app2.log', but not 'app_extra.log'
print("--- Single character wildcard ---")
for f in logs_dir.glob("app?.log"):
print(f)
# Find logs from October or November 2023.
# Note: [] matches exactly one character, so the month needs '1[01]',
# not '[10-11]' (that would be a character class, not a numeric range).
print("\n--- Character range ---")
for f in logs_dir.glob("2023-1[01]-*.log"):
    print(f)
# Example Output:
# --- Single character wildcard ---
# logs/app1.log
# logs/app2.log
#
# --- Character range ---
# logs/2023-10-01.log
# logs/2023-11-01.log
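One thing glob patterns cannot do is alternation: pathlib has no {py,txt} syntax as some shells do. A common workaround, sketched here, is to chain one generator per pattern:

from itertools import chain
from pathlib import Path

project_dir = Path("src/")
# Match several extensions by combining one rglob() generator per pattern.
for f in chain(project_dir.rglob("*.py"), project_dir.rglob("*.pyi")):
    print(f)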
2. Navigate Directories with Precision
Sometimes you need more control than rglob offers, such as when you need to skip specific directories. Instead of falling back to os.walk, you can write a clean, recursive function using pathlib’s own methods (the classic os.walk version is shown after the example for comparison).
from pathlib import Path
def smart_directory_walk(root_path, skip_dirs=None, file_patterns=None):
    """
    Walks a directory tree using pathlib, allowing you to skip entire
    subtrees and only yield files matching specific patterns.
    """
    if skip_dirs is None:
        # A set is used for fast lookups.
        skip_dirs = {'.git', '__pycache__', 'node_modules', '.venv'}
    if file_patterns is None:
        file_patterns = ["*"]
    root = Path(root_path)
    for item in root.iterdir():
        # If the item is a directory, decide whether to recurse into it.
        if item.is_dir() and item.name not in skip_dirs:
            # 'yield from' is a clean way to pass on the results of the recursive call.
            yield from smart_directory_walk(item, skip_dirs, file_patterns)
        # If it's a file, check whether it matches our patterns.
        elif item.is_file():
            if any(item.match(p) for p in file_patterns):
                yield item

if __name__ == "__main__":
    print("Searching for Python and text files, skipping standard virtual envs:")
    # To run this, create some dummy files and folders.
    project_root = Path(".")
    (project_root / "src").mkdir(exist_ok=True)
    (project_root / "src" / "main.py").touch()
    (project_root / ".venv").mkdir(exist_ok=True)
    (project_root / ".venv" / "ignored.py").touch()
    for f in smart_directory_walk(project_root, file_patterns=["*.py", "*.txt"]):
        print(f"Found: {f}")
3. Write Files Atomically for Data Safety
What happens if your script crashes halfway through writing config.json? You get a corrupt file. An atomic operation prevents this: it’s an all-or-nothing action. The standard way to achieve this is to write to a temporary file and then perform a single, atomic rename into place.
import tempfile
import os
from pathlib import Path

def atomic_write(file_path, content, encoding='utf-8'):
    """
    Writes content to a file atomically to prevent data corruption.
    """
    target_path = Path(file_path)
    target_path.parent.mkdir(parents=True, exist_ok=True)
    # Create a temporary file in the same directory as the final file.
    # This is crucial because renaming a file on the same filesystem is atomic.
    with tempfile.NamedTemporaryFile(
        mode='w',
        encoding=encoding,
        dir=target_path.parent,
        delete=False,
        suffix='.tmp'
    ) as tmp_file:
        tmp_file.write(content)
        # For critical data, os.fsync() ensures the data is physically written
        # to the disk. This is one of the few cases where the os module is
        # still necessary for low-level control.
        tmp_file.flush()
        os.fsync(tmp_file.fileno())
        temp_path = tmp_file.name
    # The atomic operation: rename the temporary file over the final destination.
    # os.replace() is atomic on the same filesystem and, unlike shutil.move,
    # is guaranteed to overwrite an existing target on every platform.
    os.replace(temp_path, target_path)
    print(f"Atomically wrote to {target_path}")
if __name__ == "__main__":
    # This will create 'config/settings.json' safely.
    atomic_write("config/settings.json", '{"theme": "dark", "retries": 3}')
4. Master Temporary Files and Directories
Temporary files are essential for intermediate processing. Python’s tempfile module integrates beautifully with pathlib, creating secure files and directories that automatically clean themselves up.
import tempfile
import json
from pathlib import Path
# A dummy function to simulate work.
def process_file(file_path):
    print(f"Processing {file_path}...")
    return f"processed_{file_path.name}"

def batch_process_data(list_of_data):
    """
    Uses a temporary directory to process a batch of data, ensuring
    all intermediate files are cleaned up automatically.
    """
    # TemporaryDirectory creates a dir that is removed when the 'with' block exits.
    with tempfile.TemporaryDirectory() as tmp_dir_str:
        tmp_dir = Path(tmp_dir_str)
        print(f"Created temporary directory: {tmp_dir}")
        results = []
        for i, data_item in enumerate(list_of_data):
            # Create a temporary Path object inside our temporary directory.
            temp_file = tmp_dir / f"input_{i}.json"
            temp_file.write_text(json.dumps(data_item))
            result = process_file(temp_file)
            results.append(result)
        # The 'tmp_dir' and all its contents are automatically deleted here.
        return results
if __name__ == "__main__":
    data_to_process = [{"id": 1, "value": "A"}, {"id": 2, "value": "B"}]
    final_results = batch_process_data(data_to_process)
    print(f"\nFinal results: {final_results}")
    print("Temporary directory has been removed.")
5. Validate and Sanitize User-Provided Paths
Never trust a path from an external source. A malicious user could provide ../../etc/passwd to try to read sensitive files. You must validate and sanitize any external path input; pathlib’s resolve() and is_relative_to() make this straightforward.
import re
from pathlib import Path
def sanitize_filename(filename, replacement='_'):
    """
    Cleans a string so it's a valid and safe filename for any OS.
    """
    invalid_chars = r'[<>:"/\\|?*\x00-\x1f]'
    sanitized = re.sub(invalid_chars, replacement, filename)
    # Additional checks can be added here (e.g., for Windows reserved
    # names; see the sketch after this example).
    return sanitized.strip(' .')

def validate_and_resolve_path(base_dir, user_path_str):
    """
    Safely resolves a user-provided path, ensuring it stays within a base directory.
    This is CRITICAL for preventing directory traversal attacks.
    """
    base_dir = Path(base_dir).resolve()
    # resolve() creates a canonical, absolute path, cleaning up '..' segments.
    resolved_path = (base_dir / user_path_str).resolve()
    # The key security check: is the final path still inside our safe base_dir?
    # Path.is_relative_to() was added in Python 3.9 and is perfect for this.
    if resolved_path.is_relative_to(base_dir):
        return resolved_path
    else:
        raise PermissionError("Path traversal attempt detected.")
if __name__ == "__main__":
    # 1. Sanitize a potentially messy filename.
    dirty_name = "My Report: Part 1/2 <Final?>.docx"
    clean_name = sanitize_filename(dirty_name)
    print(f"Sanitized '{dirty_name}' to '{clean_name}'")
    # 2. Validate a user path.
    upload_dir = "uploads"
    Path(upload_dir).mkdir(exist_ok=True)
    try:
        # Safe path
        safe_path = validate_and_resolve_path(upload_dir, "images/profile.jpg")
        print(f"OK: Path is safe: {safe_path}")
        # Malicious path
        malicious_path_str = "../../../etc/hosts"
        print(f"\nTesting malicious path: '{malicious_path_str}'")
        validate_and_resolve_path(upload_dir, malicious_path_str)
    except PermissionError as e:
        print(f"ERROR: {e}")
6. Calculate Directory Size with pathlib
Calculating directory size is a classic task. While os.scandir is known for speed, a pure pathlib approach using rglob is often more readable and convenient for all but the most performance-critical applications on massive filesystems (a scandir-based variant is sketched after the example).
from pathlib import Path
def calculate_directory_size(directory):
    """
    Calculates the total size of a directory and all its subdirectories
    using a readable, pure-pathlib approach.
    """
    dir_path = Path(directory)
    total_size = 0
    # rglob('*') is a generator, so it doesn't load all paths into memory at once.
    for path in dir_path.rglob('*'):
        # We only sum up the size of files.
        if path.is_file():
            try:
                # path.stat().st_size gives the size in bytes.
                total_size += path.stat().st_size
            except (PermissionError, FileNotFoundError):
                # Ignore files we can't access.
                continue
    return total_size

def format_size(size_bytes):
    """Formats bytes into a human-readable string (KB, MB, GB)."""
    if size_bytes == 0:
        return "0B"
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    i = 0
    while size_bytes >= 1024 and i < len(units) - 1:
        size_bytes /= 1024
        i += 1
    return f"{size_bytes:.2f} {units[i]}"
if __name__ == "__main__":
    target_directory = "."
    print(f"Calculating size of '{Path(target_directory).resolve()}'...")
    total_bytes = calculate_directory_size(target_directory)
    print(f"Total size: {format_size(total_bytes)} ({total_bytes:,} bytes)")
7. Copy Large Files with Progress and Verification
When copying large files, you want to give the user feedback and ensure the copied file isn’t corrupt. This pattern combines chunked copying with hashlib for data integrity and the tqdm library for a progress bar.
Note: You’ll need to install tqdm: pip install tqdm
import hashlib
import os
from pathlib import Path
from tqdm import tqdm
def calculate_file_hash(file_path, algorithm='sha256', chunk_size=65536):
    """Calculates the hash of a file."""
    hash_obj = hashlib.new(algorithm)
    with Path(file_path).open('rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_obj.update(chunk)
    return hash_obj.hexdigest()

def copy_with_verification(src, dst):
    """
    Copies a file with a progress bar and verifies the copy's integrity.
    """
    src_path, dst_path = Path(src), Path(dst)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    src_size = src_path.stat().st_size
    print(f"Calculating hash for {src_path.name}...")
    src_hash = calculate_file_hash(src_path)
    print(f"Copying {src_path.name} to {dst_path}...")
    with src_path.open('rb') as fsrc, \
         dst_path.open('wb') as fdst, \
         tqdm(total=src_size, unit='B', unit_scale=True, desc=src_path.name) as pbar:
        # Copy in chunks so the progress bar advances as data is written.
        while chunk := fsrc.read(16 * 1024 * 1024):
            fdst.write(chunk)
            pbar.update(len(chunk))
    print("Verifying copy...")
    dst_hash = calculate_file_hash(dst_path)
    if src_hash != dst_hash:
        dst_path.unlink()  # Delete the corrupt copy.
        raise IOError(f"Verification failed! Hashes do not match for {dst_path}")
    print(f"Success! {dst_path.name} copied and verified.")
    return dst_path
if __name__ == "__main__":
    source_file = Path("large_file.dat")
    if not source_file.exists():
        print(f"Creating dummy file '{source_file}'...")
        # os.urandom is used here simply to get random bytes for a dummy file.
        source_file.write_bytes(os.urandom(50 * 1024 * 1024))  # 50 MB
    try:
        copy_with_verification(source_file, Path("backup/large_file.dat"))
    except (IOError, FileNotFoundError) as e:
        print(f"An error occurred: {e}")
8. Monitor a Directory for Changes
Want to automatically process files dropped into a folder? You need a file system watcher. While libraries like watchdog are best for production (a sketch follows the example below), it’s useful to know how to build a simple one yourself using pathlib and polling.
import time
from pathlib import Path
from collections import defaultdict
class SimpleFileWatcher:
    """A basic file watcher that polls a directory for changes."""
    def __init__(self, watch_directory, patterns=None):
        self.watch_dir = Path(watch_directory)
        self.patterns = patterns or ["*"]
        self._file_states = {}  # Stores {path: mtime}
        self.callbacks = defaultdict(list)

    def on(self, event_type, callback):
        self.callbacks[event_type].append(callback)

    def _trigger(self, event_type, file_path):
        for callback in self.callbacks[event_type]:
            try:
                callback(file_path)
            except Exception as e:
                print(f"Error in callback: {e}")

    def watch(self, poll_interval=1.0):
        print(f"Watching {self.watch_dir.resolve()}... (Press Ctrl+C to stop)")
        try:
            while True:
                self._scan()
                time.sleep(poll_interval)
        except KeyboardInterrupt:
            print("\nStopping watcher.")

    def _scan(self):
        current_files = set()
        for pattern in self.patterns:
            for path in self.watch_dir.rglob(pattern):
                if path.is_file():
                    current_files.add(path)
                    try:
                        mtime = path.stat().st_mtime
                        if path not in self._file_states:
                            self._file_states[path] = mtime
                            self._trigger('created', path)
                        elif self._file_states[path] != mtime:
                            self._file_states[path] = mtime
                            self._trigger('modified', path)
                    except FileNotFoundError:
                        continue
        deleted_files = set(self._file_states.keys()) - current_files
        for path in deleted_files:
            del self._file_states[path]
            self._trigger('deleted', path)

def log_change(event):
    def handler(path):
        print(f"[{event.upper()}] - {path.name} at {time.ctime()}")
    return handler
if __name__ == "__main__":
    watch_folder = Path("watched_folder")
    watch_folder.mkdir(exist_ok=True)
    watcher = SimpleFileWatcher(watch_folder, patterns=["*.txt", "*.csv"])
    watcher.on('created', log_change('created'))
    watcher.on('modified', log_change('modified'))
    print("Watcher started. Try creating/editing files in 'watched_folder'.")
    watcher.watch()
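For production, the watchdog library mentioned above replaces polling with native OS notifications. A minimal sketch of an equivalent watcher (pip install watchdog):

import time
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler

class LoggingHandler(PatternMatchingEventHandler):
    def on_created(self, event):
        print(f"[CREATED] - {event.src_path}")
    def on_modified(self, event):
        print(f"[MODIFIED] - {event.src_path}")

observer = Observer()
observer.schedule(LoggingHandler(patterns=["*.txt", "*.csv"]),
                  "watched_folder", recursive=True)
observer.start()
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    observer.stop()
observer.join()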
9. Manage Configuration Files Flexibly
Applications need configuration from files (JSON, YAML, INI). This ConfigManager handles different formats gracefully, using pathlib to manage paths and our atomic_write function to save settings safely.
Note: You’ll need PyYAML: pip install pyyaml
import json
import yaml
from pathlib import Path
from typing import Any, Dict
# In a real project, the 'atomic_write' function from Pattern 3 would be
# in a shared utils.py file. We assume it's available here.
class ConfigManager:
    """A manager for loading and saving config files in various formats."""
    def __init__(self, config_path: str):
        self.path = Path(config_path)
        self.type = self.path.suffix.lower().lstrip('.')

    def load(self) -> Dict[str, Any]:
        if not self.path.exists():
            return {}
        try:
            with self.path.open('r', encoding='utf-8') as f:
                if self.type == 'json':
                    return json.load(f)
                elif self.type in ['yaml', 'yml']:
                    return yaml.safe_load(f) or {}
                # Add other formats like INI here if needed.
        except Exception as e:
            raise IOError(f"Failed to load config {self.path}: {e}")
        return {}

    def save(self, config: Dict[str, Any]):
        content = ""
        try:
            if self.type == 'json':
                content = json.dumps(config, indent=2)
            elif self.type in ['yaml', 'yml']:
                content = yaml.dump(config, default_flow_style=False)
            # Use our safe writing function from Pattern 3.
            atomic_write(self.path, content)
        except Exception as e:
            raise IOError(f"Failed to save config {self.path}: {e}")
if __name__ == "__main__":
    config = ConfigManager("config.yml")
    settings = {
        "database": {"host": "db.example.com", "port": 5432},
        "features": {"new_ui": True, "beta_access": False}
    }
    print(f"Saving settings to {config.path}...")
    config.save(settings)
    loaded = config.load()
    print("Loaded settings:")
    print(yaml.dump(loaded))
    assert settings == loaded
10. Handle Archives (ZIP, TAR) Safely
Working with .zip or .tar.gz files is common. This ArchiveManager uses pathlib to provide a simple, safe interface for creating and extracting archives, including crucial checks to prevent path traversal attacks.
import zipfile
import tarfile
from pathlib import Path
class ArchiveManager:
    """A safe and simple interface for handling zip and tar archives."""
    def _is_path_safe(self, path_str, target_dir):
        target_dir = Path(target_dir).resolve()
        resolved_path = (target_dir / path_str).resolve()
        return resolved_path.is_relative_to(target_dir)

    def extract(self, archive_path, extract_to):
        archive_path = Path(archive_path)
        extract_to = Path(extract_to)
        extract_to.mkdir(parents=True, exist_ok=True)
        if archive_path.suffix == '.zip':
            with zipfile.ZipFile(archive_path, 'r') as archive:
                for member_name in archive.namelist():
                    if self._is_path_safe(member_name, extract_to):
                        archive.extract(member_name, extract_to)
                    else:
                        print(f"WARNING: Skipped unsafe path in zip: {member_name}")
        elif '.tar' in "".join(archive_path.suffixes):
            with tarfile.open(archive_path, 'r:*') as archive:
                for member in archive.getmembers():
                    if self._is_path_safe(member.name, extract_to):
                        archive.extract(member, path=extract_to, set_attrs=False)
                    else:
                        print(f"WARNING: Skipped unsafe path in tar: {member.name}")
        else:
            raise ValueError(f"Unsupported archive type: {archive_path.suffix}")
        print(f"Successfully extracted {archive_path.name} to {extract_to}")

    def create(self, source_dir, output_path):
        source_dir = Path(source_dir)
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        if output_path.suffix == '.zip':
            with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as archive:
                for path in source_dir.rglob("*"):
                    archive.write(path, path.relative_to(source_dir))
        elif output_path.name.endswith('.tar.gz'):
            with tarfile.open(output_path, 'w:gz') as archive:
                archive.add(source_dir, arcname='.')
        else:
            raise ValueError(f"Unsupported archive type: {output_path.suffix}")
        print(f"Successfully created archive {output_path}")
if __name__ == "__main__":
    project_dir = Path("my_project")
    (project_dir / "data").mkdir(parents=True, exist_ok=True)
    (project_dir / "main.py").write_text("print('hello')")
    manager = ArchiveManager()
    archive_file = Path("backups/my_project.tar.gz")
    manager.create(project_dir, archive_file)
    extract_dir = Path("restored_project")
    manager.extract(archive_file, extract_dir)
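On Python 3.12+, the standard library can perform the tar-specific safety checks for you: tarfile’s extraction filters reject absolute paths and traversal attempts. A short sketch:

import tarfile

with tarfile.open("backups/my_project.tar.gz", 'r:*') as archive:
    # filter='data' (Python 3.12+) blocks absolute paths and '..' traversal,
    # and strips dangerous attributes such as setuid bits.
    archive.extractall("restored_project", filter='data')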
Final Thoughts
You’ve just explored ten powerful, practical patterns for file system operations using Python’s pathlib module.
- Embrace pathlib: Its object-oriented approach is cleaner, safer, and more expressive than old string-based methods. Make it your default tool for all path manipulations.
- Safety First: Always validate external input with is_relative_to, write critical files atomically, and extract archives securely.
- Readability Counts: pathlib code is often self-documenting. A chain of methods on a Path object is far easier to follow than a series of nested os.path.join calls.
By incorporating these patterns into your work, you’ll write Python code that is not only more professional but also significantly more robust and secure. Happy coding!