Error Recovery
When things go wrong, recover gracefully - retries, fallbacks, and graceful degradation.
7 min read
Beyond Catching Errors#
Catching errors is step one. Recovering from them is what separates amateur code from production code.
javascript
// Amateur: Catch and crash
// The caught error is discarded: callers only see the generic
// 'Failed to fetch' message and no recovery is attempted.
try {
await fetchData();
} catch (error) {
throw new Error('Failed to fetch');
}
// Production: Catch and recover
// On failure, log a warning and hand back whatever the backup fetch
// produces instead of propagating the error.
// NOTE(review): the bare `return` implies this snippet sits inside a function body.
try {
await fetchData();
} catch (error) {
logger.warn('Primary fetch failed, trying backup');
return fetchFromBackup(); // Recovery strategy
}
Retry Patterns#
Simple Retry#
javascript
/**
 * Calls `fn` until it succeeds or the attempt budget is used up, waiting a
 * fixed `delay` between attempts.
 *
 * @param {Function} fn - Operation to run; its result is awaited on every attempt.
 * @param {number} [retries=3] - Total number of attempts (not "extra" tries).
 *   Values below 1, fractional values, and NaN are normalized so that `fn`
 *   always runs at least once and the budget is a whole number.
 * @param {number} [delay=1000] - Milliseconds passed to `sleep` between attempts.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The error raised by the final failed attempt.
 */
async function retry(fn, retries = 3, delay = 1000) {
  // Guarantee at least one attempt and an integral budget; otherwise the loop
  // could finish without either returning a value or throwing the last error.
  const maxAttempts = Math.max(1, Math.floor(retries) || 1);

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      // Out of attempts: surface the most recent failure unchanged.
      if (attempt >= maxAttempts) throw error;
      logger.warn(`Attempt ${attempt} failed, retrying in ${delay}ms`);
      await sleep(delay);
    }
  }
}
/**
 * Returns a promise that resolves once a `setTimeout` of `ms` fires.
 *
 * @param {number} ms - Delay handed straight to `setTimeout`.
 * @returns {Promise<void>} Resolves with no value.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
// Usage
// Up to 3 attempts at fetchFromAPI, pausing 1000ms after each failed attempt.
const data = await retry(() => fetchFromAPI(), 3, 1000);
Exponential Backoff#
Better for rate-limited APIs:
javascript
/**
 * Runs `fn` with retries, multiplying the wait between attempts by `factor`
 * after every failure, up to `maxDelay`.
 *
 * @param {Function} fn - Operation to run; its result is awaited on every attempt.
 * @param {object} [options]
 * @param {number} [options.retries=3] - Total number of attempts. Values below 1,
 *   fractional values, and NaN are normalized so `fn` always runs at least once.
 * @param {number} [options.initialDelay=1000] - Wait before the second attempt, in ms.
 * @param {number} [options.maxDelay=30000] - Upper bound applied when the delay grows.
 * @param {number} [options.factor=2] - Multiplier applied to the delay after each wait.
 * @param {Function} [options.shouldRetry] - Receives the error; returning a falsy
 *   value rethrows it immediately. Defaults to always retrying.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The last error from `fn`, or the first one `shouldRetry` rejects.
 */
async function retryWithBackoff(fn, options = {}) {
  const {
    retries = 3,
    initialDelay = 1000,
    maxDelay = 30000,
    factor = 2,
    shouldRetry = () => true,
  } = options;

  // Guarantee at least one attempt and an integral budget; otherwise the loop
  // could finish without either returning a value or throwing the last error.
  const maxAttempts = Math.max(1, Math.floor(retries) || 1);
  let delay = initialDelay;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      if (attempt >= maxAttempts || !shouldRetry(error)) {
        throw error;
      }
      logger.warn({
        attempt,
        delay,
        error: error.message,
      }, 'Retrying after failure');
      await sleep(delay);
      // Grow the wait for the next round, never beyond the configured cap.
      delay = Math.min(delay * factor, maxDelay);
    }
  }
}
// Usage
// Up to 5 attempts; the wait starts at 1000ms and is capped at 30000ms.
const data = await retryWithBackoff(
() => fetchFromAPI(),
{
retries: 5,
initialDelay: 1000,
maxDelay: 30000,
// Retry only when the error's status is 429 or >= 500. Errors that carry no
// numeric `status` fail both comparisons and are therefore not retried.
shouldRetry: (error) => error.status === 429 || error.status >= 500,
}
);
Using async-retry Library#
bash
npm install async-retry
javascript
import retry from 'async-retry';
// Fetch the endpoint through async-retry; the second argument configures the
// retry schedule (presumably timeouts are in milliseconds — confirm in the library docs).
// NOTE(review): `attemptNumber` is accepted but not used inside the callback.
const result = await retry(
async (bail, attemptNumber) => {
const response = await fetch('https://api.example.com/data');
if (response.status === 404) {
bail(new Error('Not found')); // Don't retry 404s
// Return right after bail so the rest of this attempt does not run.
return;
}
if (!response.ok) {
// Any other non-OK status is thrown as an error carrying the status code.
throw new Error(`HTTP ${response.status}`);
}
return response.json();
},
{
retries: 5,
factor: 2,
minTimeout: 1000,
maxTimeout: 30000,
// Logs the error and attempt number passed to this hook.
onRetry: (error, attempt) => {
logger.warn({ error, attempt }, 'Retrying');
},
}
);
Fallback Patterns#
Cache Fallback#
javascript
/**
 * Loads a user from the primary service, falling back to the cached copy when
 * the primary lookup fails.
 *
 * @param {*} id - User identifier; also used to build the `user:<id>` cache key.
 * @returns {Promise<object>} The fresh user, or the cached user spread into a
 *   new object with `_stale: true` when the primary lookup failed.
 * @throws The primary lookup error when no cached copy is available.
 */
async function getUserWithFallback(id) {
  const cacheKey = `user:${id}`;

  let user;
  try {
    // Try fresh data first
    user = await userService.findById(id);
  } catch (error) {
    logger.warn('Database failed, checking cache');

    let cached;
    try {
      cached = await cache.get(cacheKey);
    } catch {
      // The cache is unavailable too; the primary failure is the one that
      // matters to the caller, so surface that instead of the cache error.
      throw error;
    }
    if (cached) {
      return { ...cached, _stale: true };
    }
    throw error; // No fallback available
  }

  // Refreshing the cache is best-effort: a failed write must not turn a
  // successful lookup into a failure or a stale response.
  try {
    await cache.set(cacheKey, user);
  } catch {
    logger.warn('Failed to refresh user cache');
  }
  return user;
}
Default Value Fallback#
javascript
/**
 * Reads a configuration value, falling back to a built-in default when the
 * config service has no value for the key or the lookup fails.
 *
 * @param {string} key - Configuration key.
 * @returns {Promise<*>} The service value, the built-in default, or `undefined`
 *   when neither exists.
 */
async function getConfig(key) {
  const defaults = {
    maxUploadSize: 10 * 1024 * 1024,
    sessionTimeout: 3600,
    features: { darkMode: true },
  };

  // Only own keys count as defaults; a plain `defaults[key]` would also hand
  // back inherited members such as `toString` or `constructor`.
  const fallback = Object.hasOwn(defaults, key) ? defaults[key] : undefined;

  try {
    const config = await configService.get(key);
    // `??` keeps legitimate falsy values (0, '', false) from the service.
    return config ?? fallback;
  } catch (error) {
    logger.warn(`Config fetch failed for ${key}, using default`);
    return fallback;
  }
}
Service Fallback#
javascript
/**
 * Delivers a notification through the first channel that works, trying push,
 * then email, then SMS.
 *
 * @param {*} userId - Recipient; passed to the push service and used to look
 *   up the user record for the email/SMS fallbacks.
 * @param {string} message - Notification body.
 * @returns {Promise<{method: string, success: boolean}>} Which channel delivered it.
 * @throws {Error} 'Unable to send notification' when every channel fails; the
 *   SMS-stage failure is attached as `cause`.
 */
async function sendNotification(userId, message) {
  // Try primary service
  try {
    await pushNotificationService.send(userId, message);
    return { method: 'push', success: true };
  } catch (error) {
    logger.warn('Push notification failed');
  }

  // The user record is only needed by the fallbacks, so it is loaded lazily
  // and reused: a successful lookup is never repeated for the next channel.
  let user;
  const loadUser = async () => {
    user ??= await User.findById(userId);
    return user;
  };

  // Fall back to email
  try {
    const recipient = await loadUser();
    await emailService.send(recipient.email, 'Notification', message);
    return { method: 'email', success: true };
  } catch (error) {
    logger.warn('Email notification failed');
  }

  // Fall back to SMS
  try {
    const recipient = await loadUser();
    await smsService.send(recipient.phone, message);
    return { method: 'sms', success: true };
  } catch (error) {
    logger.error('All notification methods failed');
    // Keep the underlying failure reachable for debugging.
    throw new Error('Unable to send notification', { cause: error });
  }
}
Circuit Breaker Pattern#
Prevent cascading failures by "breaking the circuit" when a service is down:
bash
npm install opossum
javascript
import CircuitBreaker from 'opossum';
// Breaker configuration. Durations appear to be in milliseconds
// (30000 is described below as 30 seconds).
const options = {
timeout: 3000, // If function takes longer, trigger failure
errorThresholdPercentage: 50, // Open circuit if 50% fail
resetTimeout: 30000, // Try again after 30 seconds
};
// Wrap the external call so it is invoked through the breaker.
const breaker = new CircuitBreaker(fetchFromExternalAPI, options);
// Events
// Each handler only logs a fixed message; the `result` and `error` arguments are not used.
breaker.on('success', (result) => logger.info('API call succeeded'));
breaker.on('failure', (error) => logger.warn('API call failed'));
breaker.on('open', () => logger.error('Circuit opened - too many failures'));
breaker.on('halfOpen', () => logger.info('Circuit half-open - testing'));
breaker.on('close', () => logger.info('Circuit closed - recovered'));
// Fallback when circuit is open
// Returns an empty data set flagged with `_fallback` so callers can tell it apart.
breaker.fallback(() => {
return { data: [], _fallback: true };
});
// Usage
// NOTE(review): fire() is called without arguments, so fetchFromExternalAPI
// presumably takes none — confirm against its definition.
const data = await breaker.fire();
Graceful Degradation#
When parts fail, keep the rest working:
javascript
/**
 * Collects the four dashboard sections in parallel and substitutes a
 * placeholder for any section whose request rejected, so one failing service
 * does not take down the whole dashboard.
 *
 * @param {*} userId - Passed unchanged to each service call.
 * @returns {Promise<{profile: *, orders: *, notifications: *, stats: *}>}
 *   Each field holds the service result, or its placeholder on failure
 *   (`stats` becomes `null`).
 */
async function getDashboardData(userId) {
  const [profileOutcome, ordersOutcome, notificationsOutcome, statsOutcome] =
    await Promise.allSettled([
      userService.getProfile(userId),
      orderService.getRecentOrders(userId),
      notificationService.getUnread(userId),
      analyticsService.getUserStats(userId),
    ]);

  // Unwrap a settled outcome, or use the section's placeholder if it rejected.
  const valueOr = (outcome, placeholder) =>
    outcome.status === 'fulfilled' ? outcome.value : placeholder;

  return {
    profile: valueOr(profileOutcome, { error: 'Unable to load profile' }),
    orders: valueOr(ordersOutcome, { error: 'Unable to load orders', data: [] }),
    notifications: valueOr(notificationsOutcome, {
      error: 'Unable to load notifications',
      count: 0,
    }),
    // Optional section: callers simply hide it when it is null.
    stats: valueOr(statsOutcome, null),
  };
}
Timeout Patterns#
Simple Timeout#
javascript
/**
 * Races `promise` against a timer and rejects with `Error('Timeout')` if the
 * timer fires first. The underlying operation is not cancelled.
 *
 * @param {Promise<*>} promise - Operation to wait for.
 * @param {number} ms - Time limit in milliseconds.
 * @returns {Promise<*>} Settles like `promise` when it finishes in time.
 * @throws {Error} With message 'Timeout' when the limit elapses first.
 */
function timeout(promise, ms) {
  let timerId;
  const timeoutPromise = new Promise((_, reject) => {
    timerId = setTimeout(() => reject(new Error('Timeout')), ms);
  });

  // Always disarm the timer once the race settles; a pending timer would
  // otherwise stay scheduled (and keep the process alive) for the full `ms`.
  return Promise.race([promise, timeoutPromise]).finally(() => {
    clearTimeout(timerId);
  });
}
// Usage
// Give fetchData() at most 5000ms before the call rejects with 'Timeout'.
try {
const data = await timeout(fetchData(), 5000);
} catch (error) {
// Timeouts are recognised by message text because timeout() rejects with a
// plain Error whose message is 'Timeout'.
if (error.message === 'Timeout') {
// Handle timeout specifically
}
}
AbortController (Cancellable)#
javascript
/**
 * Fetches `url` and parses the body as JSON, aborting the request if the
 * whole operation takes longer than `timeoutMs`.
 *
 * @param {string} url - Resource to fetch.
 * @param {number} [timeoutMs=5000] - Time limit in milliseconds.
 * @returns {Promise<*>} The parsed JSON body.
 * @throws Whatever `fetch` or `response.json()` rejects with, including the
 *   abort error raised when the time limit is hit.
 */
async function fetchWithTimeout(url, timeoutMs = 5000) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url, {
      signal: controller.signal,
    });
    // Await inside the try so the abort timer stays armed while the body is
    // read; returning the bare promise would run `finally` (and cancel the
    // timer) before the body had been received.
    return await response.json();
  } finally {
    clearTimeout(timeoutId);
  }
}
Error Recovery in Database Operations#
Transaction Rollback#
javascript
/**
 * Moves `amount` from one account balance to another inside a single
 * transaction, so both updates are committed together or not at all.
 *
 * @param {*} fromId - `_id` of the account to debit.
 * @param {*} toId - `_id` of the account to credit.
 * @param {number} amount - Value subtracted from the source balance and added
 *   to the target balance.
 * @returns {Promise<{success: boolean}>} `{ success: true }` once committed.
 * @throws The error that made the transfer fail; a failure while rolling back
 *   is logged but never replaces it.
 */
async function transferFunds(fromId, toId, amount) {
  const session = await mongoose.startSession();
  try {
    // Started inside the try so the session is still ended if this throws.
    session.startTransaction();
    await Account.updateOne(
      { _id: fromId },
      { $inc: { balance: -amount } },
      { session }
    );
    await Account.updateOne(
      { _id: toId },
      { $inc: { balance: amount } },
      { session }
    );
    await session.commitTransaction();
    return { success: true };
  } catch (error) {
    try {
      await session.abortTransaction();
      logger.error('Transfer failed, rolled back', { error, fromId, toId, amount });
    } catch (abortError) {
      // The caller needs the root cause, not the rollback failure.
      logger.error('Transfer failed and rollback did not complete', {
        error,
        abortError,
        fromId,
        toId,
        amount,
      });
    }
    throw error;
  } finally {
    session.endSession();
  }
}
Idempotent Operations#
javascript
/**
 * Charges a payment at most once per (orderId, idempotencyKey) pair and
 * records the outcome on the payment record.
 *
 * @param {*} orderId - Order the payment belongs to.
 * @param {object} paymentData - Passed to the gateway; must carry `idempotencyKey`.
 * @returns {Promise<object>} The existing payment record if one was found,
 *   otherwise the newly created record with status 'completed'.
 * @throws The gateway error when the charge fails (the record is marked
 *   'failed' first), or the persistence error when saving a completed charge fails.
 */
async function processPayment(orderId, paymentData) {
  const { idempotencyKey } = paymentData;

  // Check if already processed (idempotency)
  // NOTE(review): find-then-create is not atomic; presumably a unique index on
  // (orderId, idempotencyKey) guards concurrent calls — confirm in the schema.
  const existing = await Payment.findOne({ orderId, idempotencyKey });
  if (existing) {
    logger.info('Payment already processed, returning existing');
    return existing;
  }

  // Process new payment
  const payment = await Payment.create({
    orderId,
    idempotencyKey,
    status: 'pending',
  });

  let result;
  try {
    result = await paymentGateway.charge(paymentData);
  } catch (error) {
    payment.status = 'failed';
    payment.error = error.message;
    try {
      await payment.save();
    } catch (saveError) {
      // Do not let a bookkeeping failure hide the gateway error.
      logger.error('Failed to record payment failure', { error: saveError, orderId });
    }
    throw error;
  }

  // Persisting happens outside the charge try/catch on purpose: if this save
  // fails the customer HAS been charged, so the record must not be flipped to
  // 'failed' by the gateway-error path.
  payment.status = 'completed';
  payment.transactionId = result.id;
  await payment.save();
  return payment;
}
Monitoring Recovery#
Track how often recovery happens:
javascript
import { metrics } from '../utils/metrics.js';
/**
 * Fetches `url`, falling back to the cache entry stored under the same key
 * when the fetch rejects, and counts each outcome in `metrics.apiCallsTotal`.
 *
 * @param {string} url - Resource to fetch; also used as the cache key.
 * @returns {Promise<*>} The value `fetch` resolved with, or the cached value
 *   when the fetch failed.
 * @throws The original fetch error when the cache has no entry or the cache
 *   lookup itself fails.
 */
async function fetchWithRecovery(url) {
  try {
    const data = await fetch(url);
    metrics.apiCallsTotal.inc({ status: 'success', source: 'primary' });
    return data;
  } catch (error) {
    metrics.apiCallsTotal.inc({ status: 'failure', source: 'primary' });

    let cached;
    try {
      cached = await cache.get(url);
    } catch {
      // A failing cache is treated the same as a miss.
      cached = undefined;
    }

    // A miss is not a recovery: count it as a failed fallback and surface the
    // original error rather than handing the caller an empty value.
    if (cached == null) {
      metrics.apiCallsTotal.inc({ status: 'failure', source: 'cache_fallback' });
      throw error;
    }

    metrics.apiCallsTotal.inc({ status: 'success', source: 'cache_fallback' });
    return cached;
  }
}
Key Takeaways#
- Retry transient failures - Network blips, temporary overload
- Use exponential backoff - Don't hammer failing services
- Have fallbacks ready - Cache, defaults, alternative services
- Circuit breakers prevent cascades - Fail fast when services are down
- Degrade gracefully - Partial data is better than no data
- Make operations idempotent - Safe to retry
The Philosophy
Hope for the best, plan for the worst. Every external call can fail. Every database query can time out. Design for failure, and your users will never know.
Continue Learning
Ready to level up your skills?
Explore more guides and tutorials to deepen your understanding and become a better developer.