Immediate Actions: Production Readiness
This document covers the immediate implementation tasks to address scaling concerns and close HIPAA/GDPR compliance gaps identified in the architecture review.
1. Connection Pool Monitoring (CRITICAL)
Why This Matters
Our RLS architecture requires holding a connection for the entire request duration. Without monitoring, connection pool exhaustion can cause cascading failures with no visibility.
Implementation
File: internal/observability/pool_metrics.go
package observability
import (
"context"
"log/slog"
"time"
"github.com/jackc/pgx/v5/pgxpool"
)
// PoolMetrics periodically samples one pgxpool's statistics, logs them via
// slog, and raises alerts when utilization or wait thresholds are crossed.
type PoolMetrics struct {
pool *pgxpool.Pool // pool under observation
name string // "primary", "replica-1", etc.
ticker *time.Ticker // sampling cadence (30s, set by NewPoolMetrics)
done chan bool // signals the Start goroutine to exit
}
// NewPoolMetrics builds a collector for the given pool. Call Start to begin
// sampling every 30 seconds and Stop to shut the collector down.
func NewPoolMetrics(pool *pgxpool.Pool, name string) *PoolMetrics {
	pm := &PoolMetrics{
		pool:   pool,
		name:   name,
		ticker: time.NewTicker(30 * time.Second),
		done:   make(chan bool),
	}
	return pm
}
// Start begins collecting and logging pool metrics in a background
// goroutine; the goroutine runs until Stop is called.
func (pm *PoolMetrics) Start() {
	go func() {
		for {
			select {
			case <-pm.done:
				return
			case <-pm.ticker.C:
				pm.collectMetrics()
			}
		}
	}()
}
// Stop stops the metrics collector
// NOTE(review): the send on pm.done is unbuffered, so Stop blocks until the
// Start goroutine receives it. Calling Stop twice, or without a prior Start,
// blocks forever — confirm callers invoke it exactly once after Start.
func (pm *PoolMetrics) Stop() {
pm.ticker.Stop()
pm.done <- true
}
// collectMetrics takes one sample of the pool's statistics, logs it, and
// emits warnings/errors when alert thresholds are crossed.
func (pm *PoolMetrics) collectMetrics() {
	stats := pm.pool.Stat()
	totalConns := stats.TotalConns()
	acquiredConns := stats.AcquiredConns()
	idleConns := stats.IdleConns()
	maxConns := stats.MaxConns()

	// Calculate utilization percentage
	utilizationPct := float64(totalConns) / float64(maxConns) * 100

	// Cumulative number of successful acquires that had to wait for a free
	// connection. The original expression AcquireCount()-AcquiredConns()
	// subtracted a current int32 gauge from a cumulative int64 counter — a
	// type mismatch that does not compile and measures nothing;
	// EmptyAcquireCount is the statistic pgxpool provides for exactly this.
	// NOTE(review): the value is cumulative since pool creation, so the wait
	// alert below keeps firing after the first wait ever occurs; computing a
	// per-interval delta would require keeping the previous sample on
	// PoolMetrics.
	waitCount := stats.EmptyAcquireCount()

	slog.Info("pool_metrics",
		"pool", pm.name,
		"total_conns", totalConns,
		"acquired_conns", acquiredConns,
		"idle_conns", idleConns,
		"max_conns", maxConns,
		"utilization_pct", utilizationPct,
		"wait_count", waitCount,
	)

	// Alert thresholds
	if utilizationPct > 80 {
		slog.Warn("connection_pool_high_utilization",
			"pool", pm.name,
			"utilization_pct", utilizationPct,
			"action", "Consider scaling DB replicas or increasing pool size",
		)
	}
	if waitCount > 0 {
		slog.Error("connection_pool_waits_detected",
			"pool", pm.name,
			"wait_count", waitCount,
			"action", "Immediate attention required - connections are being delayed",
		)
	}
	if idleConns == 0 && acquiredConns == maxConns {
		slog.Error("connection_pool_exhausted",
			"pool", pm.name,
			"action", "Pool is fully saturated - new requests will fail",
		)
	}
}
// GetCurrentStats returns current pool statistics (for health checks)
func (pm *PoolMetrics) GetCurrentStats() map[string]interface{} {
stats := pm.pool.Stat()
return map[string]interface{}{
"pool": pm.name,
"total_conns": stats.TotalConns(),
"acquired_conns": stats.AcquiredConns(),
"idle_conns": stats.IdleConns(),
"max_conns": stats.MaxConns(),
"utilization_pct": float64(stats.TotalConns()) / float64(stats.MaxConns()) * 100,
"constructing_conns": stats.ConstructingConns(),
}
}Usage in main.go:
// cmd/api/main.go
func main() {
// ... after creating DB pool ...
// Start connection pool monitoring
poolMetrics := observability.NewPoolMetrics(db, "primary")
poolMetrics.Start()
defer poolMetrics.Stop()
// If using read replicas (Phase 2+)
if len(readReplicas) > 0 {
for i, replica := range readReplicas {
metrics := observability.NewPoolMetrics(replica, fmt.Sprintf("replica-%d", i+1))
metrics.Start()
defer metrics.Stop()
}
}
// ... rest of server setup ...
}Datadog/CloudWatch Integration
File: internal/observability/metrics_exporter.go
package observability
import (
"context"
"fmt"
"time"
"github.com/DataDog/datadog-go/v5/statsd"
"github.com/jackc/pgx/v5/pgxpool"
)
// MetricsExporter periodically pushes connection-pool statistics for every
// registered pool to a statsd endpoint (Datadog agent / CloudWatch agent).
type MetricsExporter struct {
statsd *statsd.Client // destination statsd client
pools map[string]*pgxpool.Pool // name -> pool
}
// NewMetricsExporter connects to the given statsd host and prepares an
// exporter for every pool in the map. Returns an error if the statsd
// client cannot be created.
func NewMetricsExporter(statsdHost string, pools map[string]*pgxpool.Pool) (*MetricsExporter, error) {
	client, err := statsd.New(statsdHost)
	if err != nil {
		return nil, fmt.Errorf("create statsd client: %w", err)
	}
	exporter := &MetricsExporter{statsd: client, pools: pools}
	return exporter, nil
}
// Start exports metrics every 10 seconds until ctx is cancelled. It blocks,
// so callers typically run it in its own goroutine.
func (me *MetricsExporter) Start(ctx context.Context) {
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			me.exportMetrics()
		}
	}
}
func (me *MetricsExporter) exportMetrics() {
for name, pool := range me.pools {
stats := pool.Stat()
tags := []string{fmt.Sprintf("pool:%s", name)}
me.statsd.Gauge("postgres.pool.total_conns", float64(stats.TotalConns()), tags, 1)
me.statsd.Gauge("postgres.pool.acquired_conns", float64(stats.AcquiredConns()), tags, 1)
me.statsd.Gauge("postgres.pool.idle_conns", float64(stats.IdleConns()), tags, 1)
me.statsd.Gauge("postgres.pool.max_conns", float64(stats.MaxConns()), tags, 1)
utilizationPct := float64(stats.TotalConns()) / float64(stats.MaxConns()) * 100
me.statsd.Gauge("postgres.pool.utilization_pct", utilizationPct, tags, 1)
// Wait count (number of times Acquire had to wait)
me.statsd.Count("postgres.pool.wait_count",
int64(stats.AcquireCount()-stats.AcquiredConns()), tags, 1)
}
}2. Query Timeout Middleware (CRITICAL)
Why This Matters
Long-running queries hold connections, blocking other requests. A runaway query can exhaust the entire pool.
Implementation
File: internal/middleware/query_timeout.go
package middleware
import (
	"context"
	"log/slog"
	"net/http"
	"sync"
	"time"

	"restartix-api/internal/pkg/httputil"
)
const (
// Default timeout for all requests
DefaultRequestTimeout = 30 * time.Second
// Timeout for expensive operations (exports, reports)
LongRequestTimeout = 2 * time.Minute
)
// RequestTimeout adds a context timeout to every request
// This ensures queries cannot run indefinitely and hold connections
func RequestTimeout(timeout time.Duration) func(http.Handler) http.Handler {
return func(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
ctx, cancel := context.WithTimeout(r.Context(), timeout)
defer cancel()
// Channel to capture if handler finished
done := make(chan bool, 1)
go func() {
next.ServeHTTP(w, r.WithContext(ctx))
done <- true
}()
select {
case <-done:
// Request completed successfully
return
case <-ctx.Done():
// Timeout exceeded
if ctx.Err() == context.DeadlineExceeded {
slog.Error("request_timeout",
"path", r.URL.Path,
"method", r.Method,
"timeout", timeout,
"user_id", UserFromContext(r.Context()).ID,
)
httputil.JSON(w, http.StatusGatewayTimeout, map[string]any{
"error": map[string]string{
"code": "request_timeout",
"message": "Request took too long to process. Please try again.",
},
})
}
}
})
}
}
// LongRequestTimeout is a helper for endpoints that need extended timeouts
func LongRequestTimeout(next http.Handler) http.Handler {
return RequestTimeout(LongRequestTimeout)(next)
}Usage in routes.go:
// internal/server/routes.go
func (s *Server) routes() http.Handler {
r := chi.NewRouter()
// Global middleware
r.Use(middleware.RequestID)
r.Use(middleware.Recovery)
r.Use(middleware.SecurityHeaders)
r.Use(middleware.Logging)
r.Use(middleware.AttackBlocker)
r.Use(middleware.RequestTimeout(middleware.DefaultRequestTimeout)) // ← Add this
// ... rest of routes ...
// Expensive operations get longer timeout
r.Route("/v1/export", func(r chi.Router) {
r.Use(middleware.LongRequestTimeout) // 2 minutes instead of 30s
r.Post("/csv", s.exportHandler.CSV)
})
return r
}Query-Level Timeouts
File: internal/database/querier.go
package database
import (
"context"
"time"
"github.com/jackc/pgx/v5"
)
const (
// Max time for any single query
// QueryTimeout is the default bound repositories pass to
// QueryWithTimeout / QueryRowWithTimeout for point reads and writes.
QueryTimeout = 5 * time.Second
// Max time for long-running queries (reports, analytics)
LongQueryTimeout = 30 * time.Second
)
// QueryWithTimeout wraps a query with a timeout.
//
// BUG FIX: the original deferred cancel(), which cancelled the context as
// soon as this function returned — before the caller could iterate the
// returned pgx.Rows (pgx consumes rows lazily) — so every row read failed
// with "context canceled". cancel now runs only on the error path; on
// success the deadline still bounds row consumption, and the context's
// timer releases its resources when the deadline expires.
func QueryWithTimeout(ctx context.Context, conn Querier, timeout time.Duration, query string, args ...any) (pgx.Rows, error) {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	rows, err := conn.Query(ctx, query, args...)
	if err != nil {
		cancel()
		return nil, err
	}
	return rows, nil
}
// QueryRowWithTimeout wraps QueryRow with a timeout
func QueryRowWithTimeout(ctx context.Context, conn Querier, timeout time.Duration, query string, args ...any) pgx.Row {
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
return conn.QueryRow(ctx, query, args...)
}
// Example usage in repository:
// rows, err := database.QueryWithTimeout(ctx, conn, database.QueryTimeout,
// "SELECT * FROM appointments WHERE organization_id = $1", orgID)3. Query Performance Tracing
Why This Matters
Identify slow queries BEFORE they become problems. Every query > 500ms should be investigated.
Implementation
File: internal/middleware/query_tracer.go
package middleware
import (
"context"
"log/slog"
"net/http"
"time"
"github.com/jackc/pgx/v5"
"restartix-api/internal/database"
)
// QueryPerformanceTracer logs slow queries for investigation
//
// NOTE(review): this middleware mutates conn.Config().ConnConfig.Tracer on
// the connection pulled from the request context. If that config object is
// shared across connections (pool configs generally are), concurrent
// requests race on the Tracer field and can observe each other's tracers.
// Confirm ConnFromContext yields a connection whose config is safe to
// mutate per-request, or install one tracer at pool-construction time.
func QueryPerformanceTracer(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Connection is nil when the route never checked one out (e.g. health).
conn := database.ConnFromContext(r.Context())
if conn != nil {
requestID := RequestIDFromContext(r.Context())
tracer := &queryTracer{
requestID: requestID,
path: r.URL.Path,
method: r.Method,
}
// Attach tracer to this connection
oldTracer := conn.Config().ConnConfig.Tracer
conn.Config().ConnConfig.Tracer = tracer
// Restore the previous tracer once the request completes.
defer func() {
conn.Config().ConnConfig.Tracer = oldTracer
}()
}
next.ServeHTTP(w, r)
})
}
// queryTracer implements pgx's query tracing hooks, tagging every traced
// query with the HTTP request that issued it.
type queryTracer struct {
requestID string // correlates log lines with the originating request
path string // request URL path
method string // HTTP method
}
func (qt *queryTracer) TraceQueryStart(ctx context.Context, conn *pgx.Conn, data pgx.TraceQueryStartData) context.Context {
return context.WithValue(ctx, "query_start", time.Now())
}
func (qt *queryTracer) TraceQueryEnd(ctx context.Context, conn *pgx.Conn, data pgx.TraceQueryEndData) {
start, ok := ctx.Value("query_start").(time.Time)
if !ok {
return
}
duration := time.Since(start)
// Log slow queries
if duration > 500*time.Millisecond {
// Truncate long queries for logging
sqlPreview := data.SQL
if len(sqlPreview) > 200 {
sqlPreview = sqlPreview[:200] + "..."
}
slog.Warn("slow_query",
"request_id", qt.requestID,
"path", qt.path,
"method", qt.method,
"duration_ms", duration.Milliseconds(),
"sql", sqlPreview,
"rows_affected", data.CommandTag.RowsAffected(),
)
// If query took > 5s, this is critical
if duration > 5*time.Second {
slog.Error("critical_slow_query",
"request_id", qt.requestID,
"duration_ms", duration.Milliseconds(),
"action", "Investigate immediately - blocking connection for too long",
)
}
}
}Add to routes:
// internal/server/routes.go
func (s *Server) routes() http.Handler {
r := chi.NewRouter()
r.Use(middleware.RequestID)
r.Use(middleware.Recovery)
r.Use(middleware.SecurityHeaders)
r.Use(middleware.Logging)
r.Use(middleware.AttackBlocker)
r.Use(middleware.RequestTimeout(middleware.DefaultRequestTimeout))
r.Use(middleware.QueryPerformanceTracer) // ← Add this
// ... routes ...
}4. Health Check with Pool Metrics
Why This Matters
App Runner health checks need to know if the connection pool is healthy, not just if the HTTP server responds.
Implementation
File: internal/health/handler.go
package health
import (
"context"
"encoding/json"
"log/slog"
"net/http"
"time"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/redis/go-redis/v9"
)
// Handler serves the /health endpoint, reporting PostgreSQL and Redis status.
type Handler struct {
db *pgxpool.Pool // primary database pool
redis *redis.Client // cache / session store
}
// NewHandler wires the health handler to its database pool and Redis client.
func NewHandler(db *pgxpool.Pool, redis *redis.Client) *Handler {
return &Handler{db: db, redis: redis}
}
// HealthResponse is the JSON body returned by Check.
type HealthResponse struct {
Status string `json:"status"` // "healthy", "degraded", "unhealthy"
Checks map[string]HealthCheck `json:"checks"`
Uptime int64 `json:"uptime_seconds"`
}
// HealthCheck describes the status of a single dependency.
type HealthCheck struct {
Status string `json:"status"`
Message string `json:"message,omitempty"`
Metrics map[string]interface{} `json:"metrics,omitempty"`
}
// startTime anchors the uptime calculation to process start.
var startTime = time.Now()
// Check runs all dependency probes and writes an aggregate health report.
// Any non-healthy check yields HTTP 503 so the platform health checker can
// react.
func (h *Handler) Check(w http.ResponseWriter, r *http.Request) {
	ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
	defer cancel()

	response := HealthResponse{
		Status: "healthy",
		Checks: make(map[string]HealthCheck),
		Uptime: int64(time.Since(startTime).Seconds()),
	}

	// mergeStatus folds a subsystem status into the overall status: any
	// "unhealthy" check makes the whole service unhealthy; otherwise any
	// non-healthy check degrades it. (The original collapsed "unhealthy"
	// into "degraded", hiding full outages from the endpoint.)
	mergeStatus := func(check HealthCheck) {
		switch {
		case check.Status == "unhealthy":
			response.Status = "unhealthy"
		case check.Status != "healthy" && response.Status == "healthy":
			response.Status = "degraded"
		}
	}

	// Check PostgreSQL
	dbCheck := h.checkPostgreSQL(ctx)
	response.Checks["postgresql"] = dbCheck
	mergeStatus(dbCheck)

	// Check Redis
	redisCheck := h.checkRedis(ctx)
	response.Checks["redis"] = redisCheck
	mergeStatus(redisCheck)

	// Overall status code
	statusCode := http.StatusOK
	if response.Status != "healthy" {
		statusCode = http.StatusServiceUnavailable
	}
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(statusCode)
	if err := json.NewEncoder(w).Encode(response); err != nil {
		// Headers are already sent; all we can do is log (the original
		// discarded this error entirely).
		slog.Error("health_check_encode_failed", "error", err)
	}
}
// checkPostgreSQL probes database connectivity and classifies pool health
// by utilization: >90% is unhealthy, >80% degraded, otherwise healthy.
func (h *Handler) checkPostgreSQL(ctx context.Context) HealthCheck {
	// Connectivity first — without it the pool numbers are moot.
	if err := h.db.Ping(ctx); err != nil {
		slog.Error("health_check_db_ping_failed", "error", err)
		return HealthCheck{
			Status:  "unhealthy",
			Message: "Database ping failed",
		}
	}

	// Snapshot pool statistics for the response payload.
	stats := h.db.Stat()
	totalConns := stats.TotalConns()
	maxConns := stats.MaxConns()
	utilizationPct := float64(totalConns) / float64(maxConns) * 100
	metrics := map[string]interface{}{
		"total_conns":     totalConns,
		"acquired_conns":  stats.AcquiredConns(),
		"idle_conns":      stats.IdleConns(),
		"max_conns":       maxConns,
		"utilization_pct": utilizationPct,
	}

	// Classify by utilization thresholds.
	switch {
	case utilizationPct > 90:
		return HealthCheck{
			Status:  "unhealthy",
			Message: "Connection pool near exhaustion",
			Metrics: metrics,
		}
	case utilizationPct > 80:
		return HealthCheck{
			Status:  "degraded",
			Message: "Connection pool utilization high",
			Metrics: metrics,
		}
	default:
		return HealthCheck{
			Status:  "healthy",
			Metrics: metrics,
		}
	}
}
func (h *Handler) checkRedis(ctx context.Context) HealthCheck {
if err := h.redis.Ping(ctx).Err(); err != nil {
slog.Error("health_check_redis_ping_failed", "error", err)
return HealthCheck{
Status: "unhealthy",
Message: "Redis ping failed",
}
}
// Get Redis memory usage
info, err := h.redis.Info(ctx, "memory").Result()
if err != nil {
return HealthCheck{
Status: "healthy", // Degraded but not critical
}
}
return HealthCheck{
Status: "healthy",
Metrics: map[string]interface{}{
"info": info,
},
}
}App Runner Health Check Configuration:
# App Runner service configuration
HealthCheckConfiguration:
Path: /health
Interval: 10
Timeout: 5
HealthyThreshold: 1
UnhealthyThreshold: 3

5. HIPAA/GDPR Compliance Fixes
5.1 Key Rotation Automation
File: docs/reference/key-rotation.md
# Encryption Key Rotation Procedure
## Schedule
**Frequency:** Quarterly (every 90 days)
**Calendar:** Set recurring reminders on:
- February 1 (Q1)
- May 1 (Q2)
- August 1 (Q3)
- November 1 (Q4)
## Automated Reminder System
### Option A: GitHub Actions (Recommended)
**.github/workflows/key-rotation-reminder.yml**
\`\`\`yaml
name: Key Rotation Reminder
on:
schedule:
# First day of every quarter at 9am UTC
- cron: '0 9 1 2,5,8,11 *'
jobs:
reminder:
runs-on: ubuntu-latest
steps:
- name: Send Slack Notification
uses: slackapi/[email protected]
with:
payload: |
{
"text": "🔐 QUARTERLY KEY ROTATION DUE",
"blocks": [
{
"type": "header",
"text": {
"type": "plain_text",
"text": "🔐 Encryption Key Rotation Required"
}
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "*Action Required:* Rotate encryption keys according to HIPAA compliance schedule.\n\n*Procedure:* See docs/reference/key-rotation.md"
}
},
{
"type": "section",
"fields": [
{
"type": "mrkdwn",
"text": "*Deadline:* Within 7 days"
},
{
"type": "mrkdwn",
"text": "*Assignee:* @security-team"
}
]
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_SECURITY_WEBHOOK }}
\`\`\`
### Option B: Calendar Automation
**Google Calendar / Outlook:**
1. Create recurring event "Encryption Key Rotation" on first day of Q1/Q2/Q3/Q4
2. Set reminders:
- 7 days before
- 3 days before
- Day of
3. Invite: security team, DevOps lead, CTO
## Step-by-Step Rotation
### Pre-Rotation Checklist
- [ ] Verify backup systems are functioning
- [ ] Identify all tables with encrypted columns:
- `patient_persons.phone_encrypted`
- `patient_persons.emergency_contact_phone_encrypted`
- `organization_integrations.api_key_encrypted`
- [ ] Estimate re-encryption time (500 rows/sec baseline)
- [ ] Schedule maintenance window if needed (typically not required)
### Rotation Steps
**1. Generate New Key**
\`\`\`bash
# Generate 32-byte (256-bit) key
openssl rand -hex 32 > new-encryption-key.txt
# Securely store in password manager
# DO NOT commit to git
\`\`\`
**2. Add New Key to Config**
\`\`\`bash
# AWS Secrets Manager (via CLI or console)
aws secretsmanager update-secret --secret-id restartix/prod --secret-string '{"ENCRYPTION_KEY_V2":"'$(cat new-encryption-key.txt)'","ENCRYPTION_CURRENT_VERSION":"2"}'
# Keep old key for decryption
# ENCRYPTION_KEY_V1 remains unchanged in the secret
\`\`\`
**3. Deploy with Multi-Version Support**
\`\`\`bash
# Deploy Core API with both keys available (push to main triggers GitHub Actions → ECR → App Runner auto-deploy)
git push origin main
# Verify deployment
curl https://api.restartix.com/health
\`\`\`
**4. Run Re-Encryption Job**
\`\`\`bash
# Run as a one-off task (e.g., via ECS RunTask or locally with production credentials)
go run cmd/tools/reencrypt/main.go
# Monitor progress
tail -f reencrypt.log
\`\`\`
**5. Verify Re-Encryption**
\`\`\`sql
-- Check patient_persons table (phone)
SELECT COUNT(*) AS old_key_count
FROM patient_persons
WHERE phone_encrypted IS NOT NULL
AND get_byte(phone_encrypted, 0) != 2; -- Key version 2
-- Should return 0
-- Check patient_persons table (emergency contact phone)
SELECT COUNT(*) AS old_key_count
FROM patient_persons
WHERE emergency_contact_phone_encrypted IS NOT NULL
AND get_byte(emergency_contact_phone_encrypted, 0) != 2;
-- Should return 0
-- Check organization_integrations table
SELECT COUNT(*) AS old_key_count
FROM organization_integrations
WHERE get_byte(api_key_encrypted, 0) != 2;
-- Should return 0
\`\`\`
**6. Remove Old Key**
\`\`\`bash
# Wait 24 hours to ensure no issues
# Then remove the old key from AWS Secrets Manager.
# IMPORTANT: update-secret REPLACES the entire secret string — writing
# '{"ENCRYPTION_KEY_V1": null}' alone would also delete the V2 key and the
# current-version marker. Rewrite the full secret without the V1 entry:
aws secretsmanager update-secret --secret-id restartix/prod --secret-string '{"ENCRYPTION_KEY_V2":"'$(cat new-encryption-key.txt)'","ENCRYPTION_CURRENT_VERSION":"2"}'
# Redeploy (push to main triggers GitHub Actions → ECR → App Runner auto-deploy)
git push origin main
\`\`\`
**7. Audit Log Entry**
\`\`\`sql
INSERT INTO audit_log (
organization_id, user_id, action, entity_type, entity_id,
changes, created_at, action_context
) VALUES (
NULL, -- System-level action
(SELECT id FROM users WHERE role = 'superadmin' LIMIT 1),
'KEY_ROTATION',
'encryption_key',
2, -- New key version
'{"old_version": 1, "new_version": 2, "rotated_at": "2026-02-13T10:00:00Z"}'::jsonb,
NOW(),
'compliance_maintenance'
);
\`\`\`
### Post-Rotation
- [ ] Verify all encrypted data readable
- [ ] Update compliance documentation
- [ ] Update BAA documentation (if key rotation affects it)
- [ ] Securely destroy old key (after 30 days)
- [ ] Schedule next rotation (90 days from now)
## Emergency Key Rotation
**Trigger:** Suspected key compromise
**Procedure:**
1. Follow rotation steps above IMMEDIATELY
2. Notify compliance officer
3. Document incident in `break_glass_log` table
4. Review all audit logs for unauthorized access
5. File security incident report

5.2 De-Identification Enforcement
File: internal/jobs/validate_test_data.go
package jobs
import (
"context"
"fmt"
"log/slog"
"regexp"
"github.com/jackc/pgx/v5/pgxpool"
)
// ValidateTestData ensures no production PHI exists in non-prod environments
// Run this in CI/CD before deploying to staging
func ValidateTestData(ctx context.Context, db *pgxpool.Pool, env string) error {
if env == "production" {
return nil // Skip in production
}
slog.Info("validating_test_data", "env", env)
violations := make([]string, 0)
// 1. Check for real email domains
realEmails, err := db.Query(ctx, `
SELECT email FROM users
WHERE email NOT LIKE '%@example.com'
AND email NOT LIKE '%@test.com'
AND email NOT LIKE '%@restartix.test'
LIMIT 10
`)
if err != nil {
return fmt.Errorf("check emails: %w", err)
}
defer realEmails.Close()
for realEmails.Next() {
var email string
realEmails.Scan(&email)
violations = append(violations, fmt.Sprintf("Real email found: %s", email))
}
// 2. Check for real phone numbers (not in test ranges)
// Romanian test range: +40700000XXX
realPhones, err := db.Query(ctx, `
SELECT name FROM patients
WHERE phone_encrypted IS NOT NULL
LIMIT 5 -- Just check a sample
`)
if err != nil {
return fmt.Errorf("check phones: %w", err)
}
defer realPhones.Close()
// Note: We can't decrypt in this check without the encryption key
// but we can verify the test data generation script was run
var patientCount int
db.QueryRow(ctx, "SELECT COUNT(*) FROM patients").Scan(&patientCount)
if patientCount == 0 {
violations = append(violations, "No test patients found - database may contain production data")
}
// 3. Check organization names
realOrgs, err := db.Query(ctx, `
SELECT name FROM organizations
WHERE name NOT LIKE '%Test%'
AND name NOT LIKE '%Demo%'
AND name NOT LIKE '%Example%'
LIMIT 10
`)
if err != nil {
return fmt.Errorf("check orgs: %w", err)
}
defer realOrgs.Close()
for realOrgs.Next() {
var name string
realOrgs.Scan(&name)
// Allow "Acme Clinic" style test names
if !regexp.MustCompile(`(?i)(acme|widgets|corp|inc test|sample)`).MatchString(name) {
violations = append(violations, fmt.Sprintf("Suspicious org name: %s", name))
}
}
// 4. Fail if violations found
if len(violations) > 0 {
slog.Error("test_data_validation_failed",
"env", env,
"violations", violations,
)
return fmt.Errorf("test data validation failed: %d violations", len(violations))
}
slog.Info("test_data_validation_passed", "env", env)
return nil
}CI/CD Integration:
# .github/workflows/deploy-staging.yml
name: Deploy to Staging
on:
push:
branches: [develop]
jobs:
validate-test-data:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Setup Go
uses: actions/setup-go@v4
with:
go-version: '1.23'
- name: Validate Test Data
env:
DATABASE_URL: ${{ secrets.STAGING_DATABASE_URL }}
run: |
go run cmd/tools/validate-test-data/main.go
- name: Fail if PHI detected
if: failure()
run: |
echo "❌ PRODUCTION PHI DETECTED IN STAGING DATABASE"
echo "This is a HIPAA violation. Deployment blocked."
exit 1
deploy:
needs: validate-test-data
runs-on: ubuntu-latest
steps:
# ... deployment steps ...

5.3 Webhook Payload PHI Audit
File: internal/webhook/payload_validator.go
package webhook
import (
"encoding/json"
"fmt"
"log/slog"
"regexp"
"strings"
)
// PHIPatterns are regex patterns that might indicate PHI in webhook payloads
var PHIPatterns = []*regexp.Regexp{
// Email addresses (should not be in webhooks)
regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`),
// Phone numbers (various formats)
regexp.MustCompile(`\+?[0-9]{10,15}`),
regexp.MustCompile(`\d{3}[-.\s]?\d{3}[-.\s]?\d{4}`),
// Potential names (capitalized words, 2+ words)
// This is a heuristic and may have false positives
regexp.MustCompile(`\b[A-Z][a-z]+ [A-Z][a-z]+\b`),
// Medical record numbers (patterns like MRN-12345)
regexp.MustCompile(`(?i)(mrn|patient[_-]?id|record[_-]?number)[:\s]*[0-9]+`),
}
// SensitiveFields are field names that should never appear in webhook payloads
// Matching is case-insensitive and substring-based (see checkSensitiveFields),
// so keys like "patient_email" or "Name" are also caught.
var SensitiveFields = []string{
"email", "phone", "phone_encrypted", "name", "address",
"ssn", "birth_date", "medical_record_number",
"diagnosis", "prescription", "treatment",
}
// ValidatePayload checks if a webhook payload contains PHI.
// Returns an error if PHI is detected or the payload cannot be inspected.
func ValidatePayload(payload map[string]interface{}) error {
	violations := make([]string, 0)

	// Convert to JSON string for pattern matching. The original ignored the
	// Marshal error and scanned an empty string, so an unmarshalable payload
	// silently passed the pattern checks.
	jsonBytes, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("marshal payload for PHI scan: %w", err)
	}
	jsonStr := string(jsonBytes)

	// Check for PHI patterns
	for _, pattern := range PHIPatterns {
		if matches := pattern.FindAllString(jsonStr, -1); len(matches) > 0 {
			violations = append(violations, fmt.Sprintf("Pattern match: %v", matches))
		}
	}

	// Check for sensitive field names
	violations = append(violations, checkSensitiveFields(payload, "")...)

	if len(violations) > 0 {
		slog.Error("webhook_phi_violation",
			"violations", violations,
			// NOTE(review): the preview may itself contain the detected PHI;
			// confirm the log sink is access-controlled before shipping.
			"payload_preview", jsonStr[:min(200, len(jsonStr))],
		)
		return fmt.Errorf("PHI detected in webhook payload: %d violations", len(violations))
	}
	return nil
}
// checkSensitiveFields walks the payload recursively and reports every
// field whose name contains an entry of SensitiveFields, identifying each
// hit by its dotted (and, for arrays, indexed) path.
func checkSensitiveFields(data interface{}, path string) []string {
	found := make([]string, 0)
	switch node := data.(type) {
	case map[string]interface{}:
		for key, child := range node {
			fullPath := key
			if path != "" {
				fullPath = path + "." + key
			}
			// Flag the key itself if its lowercase form contains a banned name.
			for _, banned := range SensitiveFields {
				if strings.Contains(strings.ToLower(key), banned) {
					found = append(found, fmt.Sprintf("Sensitive field: %s", fullPath))
				}
			}
			// Descend into nested objects/arrays.
			found = append(found, checkSensitiveFields(child, fullPath)...)
		}
	case []interface{}:
		for idx, child := range node {
			found = append(found, checkSensitiveFields(child, fmt.Sprintf("%s[%d]", path, idx))...)
		}
	}
	return found
}
// ApprovedWebhookPayload is a safe payload structure (whitelist approach)
// Only identifiers, timestamps, statuses and vetted metadata may appear —
// never PHI. Construct instances via ToApprovedPayload, which re-validates
// the serialized form before sending.
type ApprovedWebhookPayload struct {
Event string `json:"event"` // e.g., "appointment.created"
EntityType string `json:"entity_type"` // e.g., "appointment"
EntityID int64 `json:"entity_id"` // Database ID only
EntityUID string `json:"entity_uid"` // UUID only
OrganizationID int64 `json:"organization_id"`
Timestamp string `json:"timestamp"` // ISO 8601
Metadata map[string]interface{} `json:"metadata,omitempty"` // Safe metadata only
}
// ToApprovedPayload converts an internal event to a safe webhook payload
func ToApprovedPayload(event *Event) (*ApprovedWebhookPayload, error) {
payload := &ApprovedWebhookPayload{
Event: event.Type,
EntityType: event.EntityType,
EntityID: event.EntityID,
EntityUID: event.EntityUID,
OrganizationID: event.OrganizationID,
Timestamp: event.Timestamp.Format(time.RFC3339),
}
// Only include safe metadata (IDs, statuses, timestamps - NO PHI)
safeMetadata := make(map[string]interface{})
if event.EntityType == "appointment" {
safeMetadata["status"] = event.Metadata["status"]
safeMetadata["started_at"] = event.Metadata["started_at"]
}
payload.Metadata = safeMetadata
// Validate before sending
payloadMap := make(map[string]interface{})
bytes, _ := json.Marshal(payload)
json.Unmarshal(bytes, &payloadMap)
if err := ValidatePayload(payloadMap); err != nil {
return nil, err
}
return payload, nil
}Automated Testing:
// internal/webhook/payload_validator_test.go
package webhook
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestValidatePayload_DetectsPHI(t *testing.T) {
tests := []struct {
name string
payload map[string]interface{}
expectError bool
}{
{
name: "safe payload with IDs only",
payload: map[string]interface{}{
"event": "appointment.created",
"entity_id": 123,
"entity_uid": "550e8400-e29b-41d4-a716-446655440000",
"organization_id": 1,
},
expectError: false,
},
{
name: "unsafe payload with email",
payload: map[string]interface{}{
"event": "appointment.created",
"patient_email": "[email protected]",
},
expectError: true,
},
{
name: "unsafe payload with phone",
payload: map[string]interface{}{
"event": "appointment.created",
"phone": "+40700000123",
},
expectError: true,
},
{
name: "unsafe payload with name",
payload: map[string]interface{}{
"event": "appointment.created",
"patient_name": "John Doe",
},
expectError: true,
},
{
name: "unsafe payload with nested PHI",
payload: map[string]interface{}{
"event": "appointment.created",
"data": map[string]interface{}{
"patient": map[string]interface{}{
"email": "[email protected]",
},
},
},
expectError: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidatePayload(tt.payload)
if tt.expectError {
assert.Error(t, err, "Expected PHI violation to be detected")
} else {
assert.NoError(t, err, "Expected payload to be safe")
}
})
}
}CI/CD Integration:
# .github/workflows/test.yml
- name: Run Webhook PHI Audit Tests
run: go test ./internal/webhook -v -run TestValidatePayload
# Fail deployment if tests don't pass
- name: Check for webhook PHI violations
if: failure()
run: |
echo "❌ Webhook payload validation failed"
echo "Review webhook payloads for PHI leakage"
exit 1

6. Documentation Updates
Update 04-auth-and-security.md
Add this section at line 493 (after BAA checklist):
### Key Rotation Enforcement
**Automated Reminders:** GitHub Actions workflow sends quarterly reminders to security team
**Procedure:** See [Key Rotation Procedure](./key-rotation.md)
**Audit Trail:** All rotations logged in `audit_log` table with `action_context = 'compliance_maintenance'`
**Last rotation:** (tracked in audit_log)
**Next rotation due:** (automated reminder system)
### Test Data Enforcement
**Validation:** Automated CI/CD checks prevent production PHI in staging
**Procedure:** See `internal/jobs/validate_test_data.go`
**Enforcement:** Deployment blocked if real email domains or PHI patterns detected
### Webhook PHI Audit
**Validation:** All webhook payloads validated before sending
**Implementation:** See `internal/webhook/payload_validator.go`
**Testing:** Automated tests ensure no PHI in webhook payloads
**Whitelist Approach:** Only approved fields (IDs, UIDs, timestamps, statuses) allowedImplementation Checklist
Week 1: Critical Monitoring
- [ ] Implement
internal/observability/pool_metrics.go - [ ] Implement
internal/middleware/query_timeout.go - [ ] Implement
internal/middleware/query_tracer.go - [ ] Update
internal/health/handler.go with pool metrics - [ ] Add metrics to
cmd/api/main.go - [ ] Deploy to staging and verify logging
- [ ] Set up Datadog/CloudWatch dashboards
Week 2: HIPAA/GDPR Compliance
- [ ] Create
docs/reference/key-rotation.md - [ ] Set up GitHub Actions key rotation reminder
- [ ] Implement
internal/jobs/validate_test_data.go - [ ] Implement
internal/webhook/payload_validator.go - [ ] Add webhook payload tests
- [ ] Update CI/CD with validation steps
- [ ] Test key rotation procedure in staging
Week 3: Documentation & Training
- [ ] Update
docs/04-auth-and-security.md with new sections - [ ] Document Datadog alert thresholds
- [ ] Create runbook for connection pool issues
- [ ] Train team on monitoring dashboards
- [ ] Schedule first key rotation (add to calendar)
Success Metrics
| Metric | Target | Measurement |
|---|---|---|
| Connection pool monitoring | 100% uptime | Datadog dashboard |
| Slow query detection | < 1% queries > 500ms | Query tracer logs |
| Key rotation compliance | 100% on-time | Audit log entries |
| Test data validation | 0 PHI violations | CI/CD pass rate |
| Webhook PHI audit | 0 PHI in payloads | Automated tests |
Next Steps
After completing these immediate actions:
- Phase 2 Planning: Begin planning read replica migration (when > 10 orgs)
- Audit Log Partitioning: Implement monthly partitions (when > 1M audit log rows)
- Enterprise Provisioning: Build automated enterprise org provisioning (when first enterprise customer signs)