Rate Limiting Demystified - Part 5: Advanced Concepts and Industry Practices
Series Navigation:
Index |
Part 1 - Fundamentals |
Part 2 - Algorithms |
Part 3 - Implementation |
Part 4 - Distributed |
Part 6 - Interview Questions
Table of Contents
- Adaptive Rate Limiting
- Cost-Based Rate Limiting
- Rate Limiting Tiers and Quotas
- Real-World Industry Examples
- 25+ Tips and Tricks
- 15 Common Pitfalls
- 12 Anti-Patterns
- Industry Best Practices
- Monitoring and Observability
- Testing Rate Limiters
1. Adaptive Rate Limiting
What Is Adaptive Rate Limiting?
Static rate limits are set once and never change. Adaptive rate limiting adjusts limits
dynamically based on real-time system conditions. The system becomes self-regulating.
Adaptive by System Load
import psutil
import time
class AdaptiveRateLimiter:
    """
    Adjusts effective limit based on system CPU and memory utilization.
    Reduces limits when system is under stress. Restores when healthy.

    The load sample is taken without blocking and the resulting limit is
    reused for ``sample_ttl`` seconds, so ``is_allowed`` does not pay for a
    fresh measurement on every request.
    """

    def __init__(self, base_limit: int, redis_limiter, sample_ttl: float = 1.0):
        """
        Args:
            base_limit: Limit applied when the host is healthy.
            redis_limiter: Object exposing
                ``is_allowed(identifier, limit_override=...)``.
            sample_ttl: Seconds a computed limit is reused before the host
                load is sampled again.
        """
        self.base_limit = base_limit
        self.redis_limiter = redis_limiter
        self.sample_ttl = sample_ttl
        self._cached_limit = None
        self._cached_at = 0.0

    @staticmethod
    def _limit_for_load(base_limit: int, cpu_percent: float, memory_percent: float) -> int:
        """Map CPU/memory utilization (percent) to a limit, never below 1."""
        # The tiers trigger on whichever resource is more loaded.
        load = max(cpu_percent, memory_percent)
        if load > 90:
            factor = 0.2  # Critical: reduce to 20% of normal
        elif load > 75:
            factor = 0.5  # High load: reduce to 50%
        elif load > 60:
            factor = 0.75  # Moderate: reduce to 75%
        else:
            return base_limit  # Normal: full limit
        return max(1, int(base_limit * factor))

    def get_effective_limit(self) -> int:
        """Return the limit to enforce right now, re-sampling at most once per ``sample_ttl``."""
        now = time.monotonic()
        if self._cached_limit is not None and now - self._cached_at < self.sample_ttl:
            return self._cached_limit
        # interval=None returns immediately (utilization since the previous
        # call) instead of sleeping in the request path.
        cpu_percent = psutil.cpu_percent(interval=None)
        memory_percent = psutil.virtual_memory().percent
        self._cached_limit = self._limit_for_load(
            self.base_limit, cpu_percent, memory_percent
        )
        self._cached_at = now
        return self._cached_limit

    def is_allowed(self, identifier: str) -> bool:
        """Delegate to the wrapped limiter using the load-adjusted limit."""
        effective_limit = self.get_effective_limit()
        return self.redis_limiter.is_allowed(identifier, limit_override=effective_limit)


# Adaptive by Error Rate
import collections
import threading
class ErrorRateAdaptiveLimiter:
    """
    Reduces rate limits when downstream error rates are high.
    Implements a sliding window error tracker.

    Timestamps of errors and successes are kept in two deques; entries older
    than ``window_size`` seconds are dropped both when a response is recorded
    and when the limit is read.
    """

    def __init__(self, base_limit: int):
        self.base_limit = base_limit
        self.error_window = collections.deque()
        self.success_window = collections.deque()
        self.lock = threading.Lock()
        self.window_size = 60  # 1 minute

    def _evict_expired(self, now: float) -> None:
        """Drop timestamps older than the window. Caller must hold ``self.lock``."""
        window_start = now - self.window_size
        for window in (self.error_window, self.success_window):
            while window and window[0] < window_start:
                window.popleft()

    def record_response(self, is_error: bool) -> None:
        """Record one downstream response at the current time."""
        now = time.time()
        with self.lock:
            self._evict_expired(now)
            if is_error:
                self.error_window.append(now)
            else:
                self.success_window.append(now)

    def get_effective_limit(self) -> int:
        """Return the limit scaled by the error rate over the last window."""
        with self.lock:
            # Evict here too: otherwise an old burst of errors keeps the
            # limit reduced indefinitely once no new responses arrive.
            self._evict_expired(time.time())
            errors = len(self.error_window)
            total = errors + len(self.success_window)
        if total == 0:
            return self.base_limit
        error_rate = errors / total
        if error_rate > 0.5:  # >50% errors
            return max(1, int(self.base_limit * 0.1))
        if error_rate > 0.25:  # >25% errors
            return max(1, int(self.base_limit * 0.5))
        if error_rate > 0.10:  # >10% errors
            return max(1, int(self.base_limit * 0.75))
        return self.base_limit


# Adaptive by User Behavior
class BehavioralRateLimiter:
    """
    Reduces limits for users showing suspicious patterns.
    Increases limits for trusted, verified users.

    The limit is ``base_limit`` scaled by the multiplier for the caller's
    trust level; unknown trust levels are treated as "anonymous".
    """

    TRUST_LEVELS = {
        "anonymous": 0.2,   # 20% of base limit
        "new_user": 0.5,    # 50% of base limit
        "verified": 1.0,    # 100% of base limit
        "premium": 2.0,     # 200% of base limit
        "enterprise": 5.0,  # 500% of base limit
    }

    def __init__(self, base_limit: int, redis_limiter):
        self.base_limit = base_limit
        self.redis_limiter = redis_limiter

    def get_limit_for_user(self, user_id: str, trust_level: str) -> int:
        """Return the scaled limit for ``trust_level`` (at least 1).

        ``user_id`` is accepted for interface symmetry; the limit depends
        only on the trust level.
        """
        multiplier = self.TRUST_LEVELS.get(trust_level, self.TRUST_LEVELS["anonymous"])
        return max(1, int(self.base_limit * multiplier))

    def is_allowed(self, user_id: str, trust_level: str) -> bool:
        """Check ``user_id`` against the limit for its trust level."""
        limit = self.get_limit_for_user(user_id, trust_level)
        return self.redis_limiter.is_allowed(user_id, limit_override=limit)


# 2. Cost-Based Rate Limiting
The Problem with Uniform Cost
Not all API calls are equally expensive. A simple GET /users/{id} might take 2ms and
query a cache. A POST /reports/generate might take 5 seconds and scan millions of rows.
Giving them the same rate limit token cost is unfair to system resources.
GraphQL Query Complexity Rate Limiting
GitHub's GraphQL API uses a point system. Each query has a cost based on its complexity.
Users have a budget of 5,000 points per hour.
from graphql import parse, DocumentNode
from graphql.language.ast import FieldNode, SelectionSetNode
class GraphQLCostAnalyzer:
    """
    Calculates the cost of a GraphQL query for rate limiting purposes.

    Each field costs ``FIELD_COSTS[name] * LIST_MULTIPLIERS[name]`` (defaults
    1 and 1), summed over every selection in the document. Anything the
    analyzer cannot or will not walk is charged ``REJECT_COST``.
    """

    # Cost per field type
    FIELD_COSTS = {
        "users": 10,  # returns list
        "repositories": 5,
        "commits": 3,
        "viewer": 0,  # cheap, returns current user
        "_default": 1,
    }
    # Multiplier for list fields (per item)
    LIST_MULTIPLIERS = {
        "users": 100,  # could return up to 100 users
        "repositories": 50,
        "commits": 30,
    }
    # Deepest nesting level that is costed field by field.
    MAX_DEPTH = 10
    # Flat charge for unparseable queries and for selections past MAX_DEPTH.
    REJECT_COST = 1000

    def calculate_cost(self, query: str) -> int:
        """Return the cost of ``query``; never raises."""
        try:
            document = parse(query)
            return self._calculate_document_cost(document)
        except Exception:
            # Deliberate catch-all: a query that cannot be analyzed must not
            # become free, so it is charged the reject cost instead.
            return self.REJECT_COST  # Unparseable query gets high cost

    def _calculate_document_cost(self, document: DocumentNode) -> int:
        """Sum the cost of every definition that carries a selection set (minimum 1)."""
        total_cost = 0
        for definition in document.definitions:
            if hasattr(definition, "selection_set"):
                total_cost += self._calculate_selection_cost(
                    definition.selection_set, depth=0
                )
        return max(1, total_cost)

    def _calculate_selection_cost(
        self, selection_set: SelectionSetNode, depth: int
    ) -> int:
        """Recursively cost one selection set located ``depth`` levels down."""
        if not selection_set:
            return 0
        if depth > self.MAX_DEPTH:
            # Refusing to walk deeper must not make the deep part free,
            # otherwise expensive fields can be hidden under extra nesting.
            return self.REJECT_COST
        cost = 0
        for selection in selection_set.selections:
            if isinstance(selection, FieldNode):
                field_name = selection.name.value
                field_cost = self.FIELD_COSTS.get(
                    field_name, self.FIELD_COSTS["_default"]
                )
                multiplier = self.LIST_MULTIPLIERS.get(field_name, 1)
                cost += field_cost * multiplier
            # Fields and inline fragments both carry a selection_set;
            # fragment spreads do not and are costed via their definition.
            nested = getattr(selection, "selection_set", None)
            if nested:
                cost += self._calculate_selection_cost(nested, depth + 1)
        return cost
# Usage in FastAPI/Flask middleware
analyzer = GraphQLCostAnalyzer()


def check_graphql_rate_limit(user_id: str, query: str, token_bucket) -> dict:
    """Charge ``query``'s computed cost against ``user_id``'s token bucket.

    Returns the dict produced by ``token_bucket.is_allowed`` with the cost
    added under the ``"query_cost"`` key.
    """
    cost = analyzer.calculate_cost(query)
    result = token_bucket.is_allowed(user_id, cost=cost)
    result["query_cost"] = cost
    return result


# REST API Endpoint Weight
# Define weights for different endpoints and methods
ENDPOINT_WEIGHTS = {
    ("GET", "/api/users"): 1,
    ("GET", "/api/users/{id}"): 1,
    ("POST", "/api/users"): 2,
    ("GET", "/api/reports"): 5,
    ("POST", "/api/reports/generate"): 50,
    ("GET", "/api/export"): 25,
    ("POST", "/api/bulk"): 10,  # per item in bulk
    "_default": 1,
}


def get_request_cost(method: str, path: str, body: "Optional[dict]" = None) -> int:
    """Return the rate-limit cost of one request.

    Looks up ``(method, path)`` in ``ENDPOINT_WEIGHTS`` (exact match),
    falling back to the ``"_default"`` weight. For ``/api/bulk`` the weight
    is charged once per entry in ``body["items"]``.
    """
    cost = ENDPOINT_WEIGHTS.get((method, path), ENDPOINT_WEIGHTS["_default"])
    # Bulk operations: cost multiplied by item count
    if path == "/api/bulk" and body and "items" in body:
        # An empty batch still costs one unit of the endpoint weight so it
        # can never be a free request.
        cost *= max(1, len(body["items"]))
    return cost


# 3. Rate Limiting Tiers and Quotas
Tier Design
Production SaaS products use rate limit tiers tied to subscription plans.
from dataclasses import dataclass
from typing import Optional
@dataclass
class RateLimitTier:
    """Rate-limit settings attached to one subscription plan."""

    name: str
    requests_per_second: int
    requests_per_minute: int
    requests_per_hour: int
    # None means the tier has no daily / monthly cap.
    requests_per_day: Optional[int]
    requests_per_month: Optional[int]
    max_burst: int
    concurrent_connections: int
    max_response_size_mb: int


TIERS = {
    "free": RateLimitTier(
        name="free",
        requests_per_second=1,
        requests_per_minute=30,
        requests_per_hour=500,
        requests_per_day=2_000,
        requests_per_month=50_000,
        max_burst=5,
        concurrent_connections=2,
        max_response_size_mb=1,
    ),
    "starter": RateLimitTier(
        name="starter",
        requests_per_second=10,
        requests_per_minute=300,
        requests_per_hour=5_000,
        requests_per_day=50_000,
        requests_per_month=500_000,
        max_burst=50,
        concurrent_connections=10,
        max_response_size_mb=10,
    ),
    "pro": RateLimitTier(
        name="pro",
        requests_per_second=100,
        requests_per_minute=3_000,
        requests_per_hour=50_000,
        requests_per_day=500_000,
        requests_per_month=5_000_000,
        max_burst=500,
        concurrent_connections=50,
        max_response_size_mb=50,
    ),
    "enterprise": RateLimitTier(
        name="enterprise",
        requests_per_second=1_000,
        requests_per_minute=30_000,
        requests_per_hour=500_000,
        requests_per_day=None,    # Unlimited daily
        requests_per_month=None,  # Unlimited monthly
        max_burst=5_000,
        concurrent_connections=500,
        max_response_size_mb=100,
    ),
}


# Enforcing Multiple Limits Per Tier
import redis
import time
class TieredRateLimiter:
    """Enforces every time-window limit of a tier in one atomic Redis script.

    Each window uses a fixed-window counter stored under
    ``rl:{tier}:{user_id}:{window}:{window_id}``. A request is counted only
    if it passes all windows.
    """

    # KEYS[i]            fully-qualified counter key for window i
    # ARGV[2i-1], [2i]   limit and window length (seconds) for window i
    SCRIPT = """
    -- Phase 1: read-only check, so a denied request consumes no quota
    for i, key in ipairs(KEYS) do
        local limit = tonumber(ARGV[i * 2 - 1])
        local count = tonumber(redis.call('GET', key) or '0')
        if count + 1 > limit then
            return {0, i, limit, count} -- denied at tier i
        end
    end
    -- Phase 2: every window has room, count the request in all of them
    for i, key in ipairs(KEYS) do
        local window = tonumber(ARGV[i * 2])
        local count = redis.call('INCR', key)
        if count == 1 then
            redis.call('EXPIRE', key, window * 2)
        end
    end
    return {1, 0, 0, 0} -- all limits passed
    """

    def __init__(self, r: redis.Redis):
        self.r = r
        self._script = r.register_script(self.SCRIPT)

    def is_allowed(self, user_id: str, tier: str) -> dict:
        """Check and count one request for ``user_id`` on ``tier``.

        Unknown tiers use the "free" configuration. Returns a dict with
        ``allowed``, ``tier`` and ``denied_at_window`` (1-based position of
        the first limit that rejected the request, or None when allowed).
        """
        tier_config = TIERS.get(tier, TIERS["free"])
        now = int(time.time())
        # Check all applicable time-window limits
        limits_to_check = [
            (tier_config.requests_per_second, 1),
            (tier_config.requests_per_minute, 60),
            (tier_config.requests_per_hour, 3600),
        ]
        if tier_config.requests_per_day is not None:
            limits_to_check.append((tier_config.requests_per_day, 86400))
        # Build the complete key (including the window id) here so that every
        # key the script touches is declared in KEYS.
        keys = [
            f"rl:{tier}:{user_id}:{window}:{now // window}"
            for _, window in limits_to_check
        ]
        args = []
        for limit, window in limits_to_check:
            args.extend([limit, window])
        result = self._script(keys=keys, args=args)
        allowed = bool(int(result[0]))
        return {
            "allowed": allowed,
            "tier": tier,
            "denied_at_window": int(result[1]) if not allowed else None,
        }


# 4. Real-World Industry Examples
4.1 GitHub REST API
Limits:
- Unauthenticated: 60 requests/hour (by IP)
- Authenticated: 5,000 requests/hour (by user token)
- GitHub Apps: 5,000 requests/hour per installation
- Enterprise: 15,000 requests/hour
Algorithm: Fixed window (per hour, resets at the top of the hour)
Headers:
X-RateLimit-Limit: 5000
X-RateLimit-Remaining: 4987
X-RateLimit-Used: 13
X-RateLimit-Reset: 1735689600
X-RateLimit-Resource: core
Key design insight: GitHub uses separate rate limit pools per resource type:
- core: Regular API
- search: Search API (30 RPM authenticated, 10 unauthenticated)
- graphql: 5,000 points/hour
- code_scanning_upload: Separate limit
4.2 GitHub GraphQL API
Limits: 5,000 points per hour
Cost calculation:
Each connection field that returns up to N items costs N/100 points (minimum 1)
Single item fields: 1 point
Mutations: 1 point
Example query cost:
{
viewer {
# 1 point
repositories(first: 100) {
# 100/100 = 1 point... actually more complex
nodes {
name
issues(first: 50) {
# adds to cost
nodes {
title
}
}
}
}
}
}
# Total: ~51 points
Response header:
X-RateLimit-Cost: 51
X-RateLimit-Remaining: 4949
4.3 Twitter/X API v2
Free tier:
- Read: 1 request/15 min per app (essentially read-disabled)
- Write: 1,500 Tweets/month
Basic ($100/month):
- Read: 10,000 requests/month
- Write: 3,000 Tweets/month
Pro ($5,000/month):
- Read: 1,000,000 requests/month
- Write: 300,000 Tweets/month
Algorithm: Fixed window (15-minute windows for most endpoints)
Key lesson: Twitter's dramatic 2023 API pricing change taught developers to:
- Never build critical infrastructure on third-party APIs without fallback
- Cache aggressively - every API call costs money
- Respect rate limits at the SDK level
Response structure:
{
"title": "Too Many Requests",
"detail": "Too Many Requests",
"type": "about:blank",
"status": 429
}
Headers:
x-rate-limit-limit: 15
x-rate-limit-remaining: 0
x-rate-limit-reset: 1735689600
4.4 Stripe API
Limits:
- Live mode: 100 requests/second (burst of 1,000)
- Test mode: No published limit (higher than live)
Algorithm: Token bucket (Stripe uses GCRA via their open-source Throttled library)
Key Stripe behaviors:
- Limits are per-account, not per-key. Multiple API keys share one account's limit.
- Idempotency keys allow safe retries without double-charging.
- Retry-After header tells exactly when to retry.
import stripe
import time
stripe.api_key = "sk_live_..."
def create_charge_with_retry(amount: int, customer_id: str) -> stripe.Charge:
    """Create a charge, retrying on Stripe rate-limit errors.

    Makes up to three attempts, sleeping for the ``Retry-After`` value
    (default 60 seconds) between them, and re-raises the last
    ``RateLimitError``.
    """
    max_retries = 3
    default_wait = 60
    # Build the idempotency key ONCE. Every retry must send the same key;
    # rebuilding it after sleeping would roll over to a new minute bucket
    # and Stripe would treat the retry as a brand-new charge.
    idempotency_key = f"charge_{customer_id}_{amount}_{int(time.time() // 60)}"
    for attempt in range(max_retries):
        try:
            return stripe.Charge.create(
                amount=amount,
                currency="usd",
                customer=customer_id,
                idempotency_key=idempotency_key,
            )
        except stripe.error.RateLimitError as e:
            if attempt == max_retries - 1:
                raise
            headers = getattr(e, "headers", None) or {}
            try:
                retry_after = int(headers.get("Retry-After", default_wait))
            except (TypeError, ValueError):
                # Header present but not a plain number of seconds.
                retry_after = default_wait
            time.sleep(retry_after)


# 4.5 Cloudflare Rate Limiting
How Cloudflare does it:
- Uses sliding window counter algorithm (Cloudflare engineering blog, 2022)
- Implemented in Rust for performance
- Sub-millisecond rate limit decisions
- Scales to millions of rules without performance degradation
Cloudflare Rate Limiting Rule example:
Rule: "Login endpoint protection"
When: URI path contains /api/auth/login
Rate limit: 5 requests per 10 seconds per IP
Action: Block for 1 hour
Mitigation timeout: 3600 seconds
Key insight from Cloudflare: They rate limit in the data plane (Nginx/Rust workers)
BEFORE requests even reach any application code. This means blocked requests cost almost
nothing server-side.
4.6 AWS API Gateway
Throttle types:
- Account-level: Default 10,000 RPS, burst of 5,000
- Stage-level: Per API deployment stage
- Usage plan: Per API key (monetization)
- Method-level: Per HTTP method and resource path
Token bucket implementation:
- Steady-state rate: tokens refilled at `rate` RPS
- Burst: allows up to `burst` tokens to be consumed instantly
Key AWS difference: AWS rate limits return 429 with no body by default.
You configure a Gateway Response to customize the 429 response.
5. 25+ Tips and Tricks
Tip 1: Always Set Rate Limit Headers on Every Response
Even when the request is allowed, include headers. Clients use them to proactively throttle.
Not setting them means clients only discover their limit after they hit it.
# Bad: Only set headers on 429
if not allowed:
response.headers["X-RateLimit-Remaining"] = 0
# Good: Always set headers
response.headers["X-RateLimit-Limit"] = limit
response.headers["X-RateLimit-Remaining"] = remaining
response.headers["X-RateLimit-Reset"] = reset_at
Tip 2: Return Machine-Readable Retry-After
Clients should not have to calculate when to retry. Give them exact seconds.
# Less useful: Unix timestamp (client must calculate delta)
response.headers["X-RateLimit-Reset"] = reset_timestamp
# More useful for retry: seconds to wait
response.headers["Retry-After"] = str(seconds_until_reset)
# Include BOTH for maximum compatibility
Tip 3: Hash Sensitive Identifiers
Never store raw API keys or user credentials as Redis keys. Hash them.
import hashlib
def make_rate_limit_key(api_key: str) -> str:
    """Return the Redis key for ``api_key`` without exposing the key itself.

    The key is ``rl:apikey:`` followed by the first 16 hex characters of the
    SHA-256 digest of the API key.
    """
    # Hash the API key: Redis key contains only a hash, not the key itself
    key_hash = hashlib.sha256(api_key.encode()).hexdigest()[:16]
    return f"rl:apikey:{key_hash}"


# Tip 4: Use Binary IP Addresses in Redis Keys
IPv4 is 4 bytes. IPv6 is 16 bytes. Storing the string form wastes memory.
Nginx's $binary_remote_addr saves ~11 bytes per IPv4 key.
Tip 5: Warm Up Your Rate Limits
New accounts should get a warm-up period. Starting at a fraction of the base limit
(for example 20%) and ramping to 100% as the account ages limits how much damage
freshly created bot accounts can do, without penalizing established users.
def get_warmup_limit(user_id: str, base_limit: int) -> int:
    """Return the limit for ``user_id`` scaled by account age.

    Accounts younger than 1 / 7 / 30 days get 20% / 50% / 80% of
    ``base_limit``; older accounts get the full limit. A scaled limit is
    never lower than 1.
    """
    user_age_days = get_user_age_days(user_id)
    if user_age_days < 1:
        factor = 0.2
    elif user_age_days < 7:
        factor = 0.5
    elif user_age_days < 30:
        factor = 0.8
    else:
        return base_limit
    # Floor at 1 so a small base limit cannot round down to "always blocked".
    return max(1, int(base_limit * factor))


# Tip 6: Different Windows for Different Threat Models
Login endpoints need per-second AND per-minute AND per-hour limits:
- Per-second: Prevents rapid automated attacks
- Per-minute: Catches slower automated attacks
- Per-hour: Rate limits persistent attackers with backoff strategies
Tip 7: Test with "Fake" Time in Unit Tests
# Bad: Tests depend on real time, are flaky
def test_rate_limit():
limiter = RateLimiter(limit=5, window=60)
for i in range(5):
assert limiter.is_allowed("user") # passes
assert not limiter.is_allowed("user") # passes
time.sleep(61) # SLOW! Don't do this in unit tests
assert limiter.is_allowed("user") # passes
# Good: Inject clock for deterministic tests
from unittest.mock import patch
def test_rate_limit_with_mock_time():
limiter = RateLimiter(limit=5, window=60)
with patch("time.time") as mock_time:
mock_time.return_value = 1000.0
for i in range(5):
assert limiter.is_allowed("user")
assert not limiter.is_allowed("user")
mock_time.return_value = 1061.0 # Advance 61 seconds
assert limiter.is_allowed("user") # Window reset
Tip 8: Use Separate Redis DBs or Namespaces
Rate limit keys should not share space with application data. Use a separate Redis DB
(db=1 for rate limiting, db=0 for app data) or a clear key prefix.
Tip 9: Monitor Key Space Size
Rate limit keys accumulate. Monitor Redis DBSIZE and memory usage. Set maxmemory and
a sensible eviction policy (allkeys-lru or volatile-lru).
Tip 10: Implement a "Dry Run" Mode
Before enforcing new rate limits, run them in "shadow mode" - check limits but never
actually reject. Log what would have been rejected. Review logs before enabling enforcement.
# Accept "true" in any letter case and with stray whitespace.
DRY_RUN = os.getenv("RATE_LIMIT_DRY_RUN", "false").strip().lower() == "true"


def check_rate_limit(identifier, limit, window) -> bool:
    """Return True if the request may proceed.

    In dry-run mode a request that exceeds the limit is logged and allowed
    anyway; otherwise it is rejected.
    """
    result = redis_limiter.is_allowed(identifier, limit, window)
    if result["allowed"]:
        return True
    if DRY_RUN:
        logger.warning(f"[DRY RUN] Would have rejected {identifier} "
                       f"(count={result['count']}, limit={limit})")
        return True  # Allow anyway in dry run
    return False


# Tip 11: Document Your Rate Limits
Rate limits MUST be documented. Include:
- Limits for each tier
- Which endpoints have custom limits
- What headers are returned
- What to do when rate limited
- How to request higher limits
Tip 12: Rate Limit by Resource, Not Just by User
A user making 100 requests to GET /products is different from making 100 requests to
POST /payments. Resource-aware rate limiting is more accurate.
Key: "rl:{user_id}:{resource_type}:{window}"
Examples:
"rl:user123:payments:1735689600"
"rl:user123:reports:1735689600"
"rl:user123:api:1735689600"
Tip 13: Use Exponential Backoff on the Server Side Too
When your own service calls Redis and gets a timeout, don't retry immediately. Use
exponential backoff with a short max wait.
Tip 14: Rate Limit Webhooks and Callbacks
Outgoing webhooks from your system to customer endpoints are a form of outbound traffic
that should also be rate limited. Customers' servers can be slow or unavailable.
Tip 15: Log Rate Limit Events
Every 429 response should be logged with:
- Identifier (user/IP/key)
- Endpoint
- Current count vs limit
- Timestamp
This enables abuse investigation and limit tuning.
Tip 16: Provide a Quota Usage API
Let users query their current usage. This is standard for developer-facing APIs:
GET /api/rate-limit-status
{
"limit": 5000,
"used": 247,
"remaining": 4753,
"reset_at": "2025-01-01T00:00:00Z",
"tier": "pro"
}
Tip 17: Add Grace Tokens for Established Users
Give users 10-20% extra tokens if they have been using your API for >1 year. This
rewards loyalty and reduces support tickets from accidental limit breaches.
Tip 18: Use Different TTLs for Window Cleanup
Always set the Redis TTL to 2x the window size, not 1x. This ensures the previous
window key is still available for sliding window counter calculations.
Tip 19: Implement Per-IP Limits in Addition to Per-User
Even if a user is authenticated, their IP may be compromised. Layer both:
- Per-IP limit (coarser, protects against compromised credentials)
- Per-user limit (finer, enforces subscription limits)
Tip 20: Rate Limit OPTIONS Requests Separately or Exempt Them
CORS preflight (OPTIONS) requests double your API call count but are browser-generated.
Either exempt them or count them at a much higher limit.
Tip 21: Propagate Rate Limit Context in Microservices
When Service A calls Service B on behalf of User X, pass the user context so Service B
can contribute to the same user rate limit bucket.
# Pass user rate limit context in request headers between microservices
headers = {
"X-User-Id": user_id,
"X-Rate-Limit-Remaining": str(remaining),
"X-Request-Cost": str(cost)
}
requests.get("http://service-b/internal/data", headers=headers)
Tip 22: Use Redis OBJECT ENCODING to Understand Memory Usage
# Check how Redis is storing a rate limit key
OBJECT ENCODING rl:fw:user123:28956
# Returns: "int" (very efficient) or "embstr" or "raw"
Tip 23: Consider Time Zone for Daily Limits
Daily limits that reset at UTC midnight are inconvenient for users in timezones far from UTC.
Consider resetting at the user's local midnight, or use a rolling 24-hour window instead.
Tip 24: Precompute Rate Limit Keys
Instead of computing the current window key on every request, precompute it once at
request entry and reuse it across multiple checks.
Tip 25: Rate Limit Background Jobs Separately
Background jobs (cron jobs, queue workers) should have their own rate limit quotas,
separate from user-facing API requests. Otherwise, a busy background job can eat the
user's quota.
6. 15 Common Pitfalls
Pitfall 1: Using the Proxy's IP (remote_addr) Instead of the Real Client IP
# WRONG: This is your proxy/load balancer's IP
client_ip = request.remote_addr # "10.0.0.5" (internal proxy)
# CORRECT: Get the real client IP from forwarded headers
def get_real_ip(request) -> str:
    """Return the client IP for ``request``.

    Preference order: first non-empty entry of ``X-Forwarded-For``, then
    ``X-Real-IP``, then ``request.remote_addr``.
    """
    forwarded_for = request.headers.get("X-Forwarded-For")
    if forwarded_for:
        # "X-Forwarded-For: client, proxy1, proxy2" - take the first
        first_hop = forwarded_for.split(",")[0].strip()
        # A blank first entry must not become the rate-limit identity.
        if first_hop:
            return first_hop
    return request.headers.get("X-Real-IP", request.remote_addr)


# Warning: X-Forwarded-For can be spoofed by clients. Trust it only if you control
# the proxy that sets it. Some proxies (like AWS ALB) can be configured to overwrite, not append.
Pitfall 2: Not Setting Key Expiry (Memory Leak)
# WRONG: Key lives forever
redis.incr(f"rl:{user_id}")
# CORRECT: Always set expiry
redis.incr(f"rl:{user_id}")
redis.expire(f"rl:{user_id}", window_seconds)
# Or use SET with EX option atomically
Pitfall 3: Using Non-Atomic INCR + EXPIRE
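INCR and EXPIRE are two separate commands. If the process crashes between them, the
key is left with no TTL. Also note that calling EXPIRE on every request pushes the TTL
forward each time, so under steady traffic the counter never resets; set the TTL only
when INCR returns 1 (i.e., when the key was just created).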
# SAFER: Use pipeline or Lua
pipe = redis.pipeline()
pipe.incr(key)
pipe.expire(key, window)
pipe.execute()
Pitfall 4: Single Redis Instance as a Single Point of Failure
A rate limiter using a single Redis instance will fail (and the fail-open strategy will
allow unlimited traffic) if that Redis instance goes down.
Fix: Redis Sentinel for high availability, or Redis Cluster for both HA and scale.
Pitfall 5: The Thundering Herd After a Rate Limit Clears
When a rate limit resets (e.g., at the top of the minute), all users who were blocked
rush to make requests simultaneously. This creates a huge spike.
Fix: Add jitter to reset times. Instead of all windows resetting at :00 seconds,
each user's window resets at a random second offset.
import hashlib
def get_jittered_window_start(user_id: str, window_seconds: int) -> int:
    """Return the Unix time at which ``user_id``'s current window started.

    Windows are ``window_seconds`` long and shifted by a per-user offset
    derived from a hash of the user id, so different users' windows do not
    all reset at the same instant. The result always satisfies
    ``start <= now < start + window_seconds``.
    """
    # Deterministic but distributed: each user has a unique window offset
    user_hash = int(hashlib.sha256(user_id.encode()).hexdigest(), 16)
    offset = user_hash % window_seconds
    now = int(time.time())
    window_base = now - (now % window_seconds)
    start = window_base + offset
    if start > now:
        # This user's boundary has not been reached yet within the aligned
        # window, so the current window began one full window earlier.
        start -= window_seconds
    return start


# Pitfall 6: Rate Limiting by User ID Without Authentication
If rate limiting by user ID without verifying the token, any client can claim to be
any user and avoid limits by cycling through fake user IDs.
Fix: Always authenticate first. Rate limit only on verified identities.
Pitfall 7: Not Rate Limiting Internal/Admin APIs
Developers often rate limit only public-facing APIs. Internal admin APIs are often
equally vulnerable to abuse and mistakes. A buggy admin script can DDoS your own database.
Pitfall 8: Setting Limits Too Low During Development
Rate limits configured for production capacity are too low in development environments
where test scripts run at full speed. Use different limits per environment.
Pitfall 9: Counting Cached Responses Toward Rate Limits
If your API serves cached responses, the cost to your system is near zero. Counting
cached responses toward rate limits unfairly penalizes users for cheap requests.
Pitfall 10: Not Testing Rate Limit Edge Cases
Common untested scenarios:
- What happens at exactly the limit (100th request)?
- What happens when the window resets?
- What happens when Redis is unavailable?
- What happens with concurrent requests that race the limit?
Pitfall 11: Inconsistent Clock Sources
If some servers get time from NTP server A and others from NTP server B, and those
servers disagree by even 1 second, your fixed-window rate limiting will have gaps
or overlaps at window boundaries.
Fix: Use the same NTP server for all instances, or pass time as a parameter to
your rate limiting function and use the Redis server time (TIME command).
Pitfall 12: Allowing Too Large a Burst
A token bucket with capacity=10,000 and rate=100/min means a user can send 10,000
requests instantly if they've been idle. This "burst" can overwhelm downstream services
even if the average rate is within limits.
Fix: Set burst capacity to a reasonable multiple of the per-minute rate (e.g., 2-5x),
not 100x.
Pitfall 13: Rate Limiting After Heavy Business Logic
Some implementations run expensive business logic THEN check rate limits. This wastes
CPU on requests that should have been rejected immediately.
Fix: Always check rate limits FIRST, before any business logic.
Pitfall 14: Not Providing Rate Limit Status Endpoint
Without a status endpoint, developers cannot check their current usage without making
a real API call. Add a dedicated, cheap endpoint that returns rate limit status.
Pitfall 15: Ignoring Rate Limiting in API Documentation
Rate limits that are not documented cause endless support tickets. Every API reference
page must clearly state the limits for each endpoint and tier.
7. 12 Anti-Patterns
Anti-Pattern 1: "The Sleep Loop"
Name: Sleep-based retry
What it looks like:
# WRONG: Naive sleep loop
while True:
if not rate_limiter.is_allowed(user_id):
time.sleep(1) # Sleep 1 second, then try again
continue
# do work
Why it's wrong: Sleep is blocking. Under load, this holds threads. It also causes
thundering herd when all sleepers wake at the same time.
Fix: Use async/await with exponential backoff and jitter.
Anti-Pattern 2: "The Optimistic Incrementer"
Name: Read-check-write without atomicity
What it looks like:
# WRONG: Non-atomic check-then-increment
count = int(redis.get(key) or 0)
if count < limit:
redis.set(key, count + 1) # Race condition here
return True
return False
Why it's wrong: Two threads can both read count < limit and both increment.
Fix: Use INCR or a Lua script.
Anti-Pattern 3: "The Global Counter"
Name: Single shared counter for all users
What it looks like:
# WRONG: One counter for the entire system
if redis.incr("global:counter") > 10000:
return 429
Why it's wrong: One heavy user can block ALL users. No per-user fairness.
Fix: Per-user, per-IP, or per-API-key counters.
Anti-Pattern 4: "The Memory Sponge"
Name: Sliding window log without bounds
What it looks like:
# WRONG: Unbounded log grows forever
user_log = all_logs[user_id]
user_log.append(time.time()) # Never evict old entries
Why it's wrong: Memory grows unbounded. With 1M users and 1K requests/min each,
you need hundreds of GB just for the logs.
Fix: Use sliding window counter (O(1) memory) or bounded deque.
Anti-Pattern 5: "The Ignored Header"
Name: Not reading Retry-After
What it looks like:
# WRONG: Ignore Retry-After, use fixed delay
except RateLimitError:
time.sleep(5) # Always wait 5 seconds, ignoring Retry-After
retry()
Why it's wrong: The server says exactly when the limit resets. Ignoring it means
retrying too early (getting another 429) or waiting too long (wasting time).
Fix: Always read and respect the Retry-After header.
Anti-Pattern 6: "The Endpoint Exemption"
Name: Exempting internal or "trusted" endpoints from rate limiting
What it looks like:
# WRONG: Internal network requests skip rate limiting
if request.remote_addr.startswith("10."):
return # Skip rate limit for "internal" requests
Why it's wrong: Internal services can bug out too. A runaway job from an internal
IP can DDoS your own infrastructure just as badly as external traffic.
Fix: Apply rate limits to all traffic. Use higher limits for internal callers if needed.
Anti-Pattern 7: "The Cascading Failure Amplifier"
Name: Retrying without backoff on 429
What it looks like:
# WRONG: Immediate retry loop
for attempt in range(100):
response = api.call()
if response.status_code == 429:
continue # Retry immediately - amplifies the problem!
Why it's wrong: Immediate retries under rate limiting make things worse. All clients
get 429, all retry immediately, all get 429 again. Classic thundering herd.
Fix: Exponential backoff with jitter.
Anti-Pattern 8: "The Authentication Bypass"
Name: Rate limiting only after successful authentication
What it looks like:
def login(username, password):
user = authenticate(username, password)
if user:
# Only rate limit successful logins?
check_rate_limit(user.id)
return generate_token(user)
return 401
Why it's wrong: Failed login attempts are the exact ones you need to rate limit!
Brute force attacks use failed attempts.
Fix: Rate limit BEFORE authentication. Key by username or IP.
Anti-Pattern 9: "The Silent Drop"
Name: Dropping requests without any response
What it looks like:
# WRONG: Drop request silently, return nothing or close connection
if rate_limited:
return # No response, connection just closes
Why it's wrong: Clients have no idea what happened. They will retry immediately.
Fix: Always return a proper 429 with Retry-After.
Anti-Pattern 10: "The Undocumented Limit"
Name: Enforcing undocumented rate limits
What it looks like: Implementing rate limits but not publishing them in API docs.
Why it's wrong: Developers building on your API get mysterious 429s with no explanation.
This creates support tickets and frustration.
Fix: Document every limit on your API documentation page.
Anti-Pattern 11: "The Magic Number Limit"
Name: Setting rate limits based on guesswork without load testing
What it looks like: "I'll set it to 1,000 RPM because that sounds reasonable."
Why it's wrong: Without measuring actual capacity, you might set limits too high
(allowing overload) or too low (unnecessarily blocking legitimate users).
Fix: Load test your system, measure peak capacity, set limit at 70-80% of capacity.
Anti-Pattern 12: "The Same Limit Everywhere"
Name: Uniform rate limits for all endpoints
What it looks like:
# WRONG: Same limit for all endpoints regardless of cost
@rate_limit(limit=100, window=60) # 100/min applied to everything
def all_endpoints():
    ...
Why it's wrong: A GET /products and a POST /bulk-export should NOT have the
same limit. The export is 100x more expensive.
Fix: Endpoint-specific limits based on actual measured cost.
8. Industry Best Practices
8.1 Design Principles
- Fail gracefully: Always have a fallback when the rate limiter fails
- Be transparent: Always return rate limit headers on every response
- Be predictable: Same input should produce same output; avoid arbitrary behavior
- Be documented: Rate limits MUST be in your API docs
- Start conservative: It is easier to loosen limits than to tighten them after users depend on higher limits
8.2 Rate Limit Values by Category
Based on industry averages across major APIs:
| API Type | Suggested Starting Limit |
|---|---|
| Public REST API (free tier) | 60-100 req/hour |
| Public REST API (paid tier) | 1,000-5,000 req/min |
| Authentication endpoint | 5-10 req/min per username/IP |
| Search endpoint | 30-100 req/min |
| Export/report endpoint | 2-5 req/hour |
| Webhook delivery | 10-50 req/min per customer endpoint |
| Internal service API | 1,000-10,000 req/sec |
8.3 RFC Compliance
Follow these RFCs for HTTP rate limiting:
- RFC 6585: HTTP 429 status code
- RFC 9110: HTTP Semantics (Retry-After header format; obsoletes RFC 7231)
- draft-ietf-httpapi-ratelimit-headers: Standard RateLimit-* headers (adopt when stable)
8.4 API Design Checklist
[ ] Rate limits defined for each endpoint and tier
[ ] Headers returned on every response (200 and 429)
[ ] Retry-After on every 429 response
[ ] Machine-readable error response (JSON with error code)
[ ] Rate limit status endpoint (/rate-limit-status or in /health)
[ ] Documentation updated
[ ] Monitoring alerts configured
[ ] Tests written (unit + integration + load)
[ ] Logs include rate limit events
[ ] Fail-open/fail-closed policy defined and implemented
[ ] Redis key naming convention defined
[ ] Key expiry (TTL) set on all rate limit keys
[ ] Lua scripts used for atomic multi-step operations
9. Monitoring and Observability
Key Metrics to Track
# Metrics to expose (Prometheus format)
rate_limit_requests_total{endpoint, user_tier, result} # allowed vs denied
rate_limit_utilization{user_id, tier} # % of limit used
rate_limit_denied_rate # 429s per second
rate_limit_redis_latency_p99 # Redis round trip
rate_limit_top_consumers # Top 10 heaviest users
Alerting Rules
# Prometheus alerting rules
groups:
  - name: rate_limiting
    rules:
      - alert: HighDenyRate
        # Denials are the result="denied" series of rate_limit_requests_total
        # (label value assumed - confirm against the exporter). sum() collapses
        # the endpoint/user_tier series so the threshold is a global rate.
        expr: sum(rate(rate_limit_requests_total{result="denied"}[5m])) > 100
        for: 2m
        annotations:
          summary: "More than 100 rate limit denials per second for 2 minutes"
      - alert: RedisRateLimitDown
        expr: up{job="redis-rate-limit"} == 0
        for: 30s
        annotations:
          summary: "Redis rate limiter is down - fail-open mode active"
      - alert: SuspiciousHighUser
        # Threshold assumes utilization is exported as a 0-1 ratio, not 0-100.
        expr: rate_limit_utilization > 0.95
        for: 5m
        annotations:
          summary: "User consistently near rate limit - potential abuse"
Logging Best Practices
import structlog
logger = structlog.get_logger()
def log_rate_limit_event(
identifier: str,
endpoint: str,
allowed: bool,
count: int,
limit: int
) -> None:
log_fn = logger.info if allowed else logger.warning
log_fn(
"rate_limit_check",
identifier=identifier,
endpoint=endpoint,
allowed=allowed,
count=count,
limit=limit,
utilization=round(count / limit, 3),
event_type="rate_limit_denied" if not allowed else "rate_limit_allowed"
)10. Testing Rate Limiters
Unit Tests
import pytest
from unittest.mock import patch, MagicMock
import time
class TestRateLimiter:
def test_allows_requests_within_limit(self):
limiter = RateLimiter(limit=5, window=60)
for i in range(5):
assert limiter.is_allowed("user1"), f"Request {i+1} should be allowed"
def test_denies_requests_over_limit(self):
limiter = RateLimiter(limit=5, window=60)
for _ in range(5):
limiter.is_allowed("user1")
assert not limiter.is_allowed("user1"), "6th request should be denied"
def test_different_users_have_independent_limits(self):
limiter = RateLimiter(limit=5, window=60)
for _ in range(5):
limiter.is_allowed("user1") # Exhaust user1's limit
assert limiter.is_allowed("user2"), "User2 should not be affected"
@patch("time.time")
def test_window_resets_after_expiry(self, mock_time):
mock_time.return_value = 1000.0
limiter = RateLimiter(limit=5, window=60)
for _ in range(5):
limiter.is_allowed("user1")
assert not limiter.is_allowed("user1")
mock_time.return_value = 1061.0 # Advance past window
assert limiter.is_allowed("user1"), "Should allow after window reset"
def test_returns_correct_headers(self):
limiter = RateLimiter(limit=5, window=60)
result = limiter.check("user1")
assert result["limit"] <mark class="obsidian-highlight"> 5
assert result["remaining"] </mark> 4
assert "reset_at" in result
def test_concurrent_requests_respect_limit(self):
import threading
limiter = RateLimiter(limit=10, window=60)
allowed_count = [0]
lock = threading.Lock()
def make_request():
if limiter.is_allowed("user1"):
with lock:
allowed_count[0] += 1
threads = [threading.Thread(target=make_request) for _ in range(20)]
for t in threads: t.start()
for t in threads: t.join()
assert allowed_count[0] <= 10, f"Expected <= 10 allowed, got {allowed_count[0]}"Integration Tests with Redis
import pytest
import redis as redis_lib
import fakeredis # pip install fakeredis
@pytest.fixture
def fake_redis():
    """Provide a fakeredis client so tests stay fast and isolated, with no real Redis instance."""
    client = fakeredis.FakeRedis(decode_responses=True)
    return client
@pytest.fixture
def redis_limiter(fake_redis):
    """Build a RedisRateLimiter on the fake client: limit 10, window of 60 seconds."""
    limiter_kwargs = {
        "redis_client": fake_redis,
        "limit": 10,
        "window_seconds": 60,
    }
    return RedisRateLimiter(**limiter_kwargs)
class TestRedisRateLimiter:
def test_lua_script_atomicity(self, redis_limiter):
"""Verify Lua script prevents race conditions."""
import concurrent.futures
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
futures = [
executor.submit(redis_limiter.is_allowed, "user1")
for _ in range(20)
]
results = [f.result()["allowed"] for f in futures]
allowed = sum(1 for r in results if r)
assert allowed <= 10, f"Expected at most 10 allowed, got {allowed}"Load Testing
# Using wrk to load test rate limiting behavior
wrk -t 10 -c 100 -d 30s \
-H "X-API-Key: test_key_123" \
http://localhost:8080/api/data
# Expected result: first N requests succeed (200), rest return 429
# Check rate of 429s in output
Summary
| Topic | Key Takeaway |
|---|---|
| Adaptive rate limiting | Adjust limits based on system health, user trust, error rates |
| Cost-based limiting | GraphQL/expensive ops consume more tokens |
| Tiered limits | Free/Starter/Pro/Enterprise with compound limits |
| GitHub | Fixed window, separate pools per resource type |
| Twitter/X | Fixed window per 15 min, expensive API pricing after 2023 |
| Stripe | Token bucket (GCRA), idempotency keys for safe retries |
| Cloudflare | Sliding window counter, sub-ms decisions in Rust |
| AWS API Gateway | Token bucket, per-stage and per-key limits |
| Top tips | Headers on every response, hash keys, warmup, dry run mode |
| Top pitfalls | Wrong IP, no TTL, no atomicity, ignoring Retry-After |
| Anti-patterns | Sleep loops, global counter, silent drops, undocumented limits |
| Monitoring | Deny rate, utilization, Redis latency, top consumers |
Next: Part 6 - Interview Questions
80+ interview questions organized by frequency and difficulty - from conceptual to system design
to tricky edge cases seen in 2024-2026 interviews at top tech companies.