Immediate Actions: Production Readiness

This document covers the immediate implementation tasks to address scaling concerns and close HIPAA/GDPR compliance gaps identified in the architecture review.

1. Connection Pool Monitoring (CRITICAL)

Why This Matters

Our RLS architecture requires holding a connection for the entire request duration. Without monitoring, connection pool exhaustion can cause cascading failures with no visibility.

Implementation

File: internal/observability/pool_metrics.go

package observability

import (
    "context"
    "log/slog"
    "time"

    "github.com/jackc/pgx/v5/pgxpool"
)

type PoolMetrics struct {
    pool   *pgxpool.Pool
    name   string  // "primary", "replica-1", etc.
    ticker *time.Ticker
    done   chan bool
}

func NewPoolMetrics(pool *pgxpool.Pool, name string) *PoolMetrics {
    return &PoolMetrics{
        pool:   pool,
        name:   name,
        ticker: time.NewTicker(30 * time.Second),
        done:   make(chan bool),
    }
}

// Start begins collecting and logging pool metrics
func (pm *PoolMetrics) Start() {
    go func() {
        for {
            select {
            case <-pm.ticker.C:
                pm.collectMetrics()
            case <-pm.done:
                return
            }
        }
    }()
}

// Stop stops the metrics collector
func (pm *PoolMetrics) Stop() {
    pm.ticker.Stop()
    pm.done <- true
}

func (pm *PoolMetrics) collectMetrics() {
    stats := pm.pool.Stat()

    totalConns := stats.TotalConns()
    acquiredConns := stats.AcquiredConns()
    idleConns := stats.IdleConns()
    maxConns := stats.MaxConns()

    // Calculate utilization percentage
    utilizationPct := float64(totalConns) / float64(maxConns) * 100

    // Calculate wait count (connections that had to wait)
    // Note: This is cumulative, so we track delta
    waitCount := stats.AcquireCount() - stats.AcquiredConns()

    slog.Info("pool_metrics",
        "pool", pm.name,
        "total_conns", totalConns,
        "acquired_conns", acquiredConns,
        "idle_conns", idleConns,
        "max_conns", maxConns,
        "utilization_pct", utilizationPct,
        "wait_count", waitCount,
    )

    // Alert thresholds
    if utilizationPct > 80 {
        slog.Warn("connection_pool_high_utilization",
            "pool", pm.name,
            "utilization_pct", utilizationPct,
            "action", "Consider scaling DB replicas or increasing pool size",
        )
    }

    if waitCount > 0 {
        slog.Error("connection_pool_waits_detected",
            "pool", pm.name,
            "wait_count", waitCount,
            "action", "Immediate attention required - connections are being delayed",
        )
    }

    if idleConns == 0 && acquiredConns == maxConns {
        slog.Error("connection_pool_exhausted",
            "pool", pm.name,
            "action", "Pool is fully saturated - new requests will fail",
        )
    }
}

// GetCurrentStats returns current pool statistics (for health checks)
func (pm *PoolMetrics) GetCurrentStats() map[string]interface{} {
    stats := pm.pool.Stat()

    return map[string]interface{}{
        "pool":              pm.name,
        "total_conns":       stats.TotalConns(),
        "acquired_conns":    stats.AcquiredConns(),
        "idle_conns":        stats.IdleConns(),
        "max_conns":         stats.MaxConns(),
        "utilization_pct":   float64(stats.TotalConns()) / float64(stats.MaxConns()) * 100,
        "constructing_conns": stats.ConstructingConns(),
    }
}

Usage in main.go:

// cmd/api/main.go

func main() {
    // ... after creating DB pool ...

    // Start connection pool monitoring
    poolMetrics := observability.NewPoolMetrics(db, "primary")
    poolMetrics.Start()
    defer poolMetrics.Stop()

    // If using read replicas (Phase 2+)
    if len(readReplicas) > 0 {
        for i, replica := range readReplicas {
            metrics := observability.NewPoolMetrics(replica, fmt.Sprintf("replica-%d", i+1))
            metrics.Start()
            defer metrics.Stop()
        }
    }

    // ... rest of server setup ...
}

Datadog/CloudWatch Integration

File: internal/observability/metrics_exporter.go

package observability

import (
    "context"
    "fmt"
    "time"

    "github.com/DataDog/datadog-go/v5/statsd"
    "github.com/jackc/pgx/v5/pgxpool"
)

type MetricsExporter struct {
    statsd *statsd.Client
    pools  map[string]*pgxpool.Pool  // name -> pool
}

func NewMetricsExporter(statsdHost string, pools map[string]*pgxpool.Pool) (*MetricsExporter, error) {
    client, err := statsd.New(statsdHost)
    if err != nil {
        return nil, fmt.Errorf("create statsd client: %w", err)
    }

    return &MetricsExporter{
        statsd: client,
        pools:  pools,
    }, nil
}

func (me *MetricsExporter) Start(ctx context.Context) {
    ticker := time.NewTicker(10 * time.Second)
    defer ticker.Stop()

    for {
        select {
        case <-ticker.C:
            me.exportMetrics()
        case <-ctx.Done():
            return
        }
    }
}

func (me *MetricsExporter) exportMetrics() {
    for name, pool := range me.pools {
        stats := pool.Stat()

        tags := []string{fmt.Sprintf("pool:%s", name)}

        me.statsd.Gauge("postgres.pool.total_conns", float64(stats.TotalConns()), tags, 1)
        me.statsd.Gauge("postgres.pool.acquired_conns", float64(stats.AcquiredConns()), tags, 1)
        me.statsd.Gauge("postgres.pool.idle_conns", float64(stats.IdleConns()), tags, 1)
        me.statsd.Gauge("postgres.pool.max_conns", float64(stats.MaxConns()), tags, 1)

        utilizationPct := float64(stats.TotalConns()) / float64(stats.MaxConns()) * 100
        me.statsd.Gauge("postgres.pool.utilization_pct", utilizationPct, tags, 1)

        // Wait count (number of times Acquire had to wait)
        me.statsd.Count("postgres.pool.wait_count",
            int64(stats.AcquireCount()-stats.AcquiredConns()), tags, 1)
    }
}

2. Query Timeout Middleware (CRITICAL)

Why This Matters

Long-running queries hold connections, blocking other requests. A runaway query can exhaust the entire pool.

Implementation

File: internal/middleware/query_timeout.go

package middleware

import (
    "context"
    "log/slog"
    "net/http"
    "time"

    "restartix-api/internal/pkg/httputil"
)

const (
    // Default timeout for all requests
    DefaultRequestTimeout = 30 * time.Second

    // Timeout for expensive operations (exports, reports)
    LongRequestTimeout = 2 * time.Minute
)

// RequestTimeout adds a context timeout to every request
// This ensures queries cannot run indefinitely and hold connections
func RequestTimeout(timeout time.Duration) func(http.Handler) http.Handler {
    return func(next http.Handler) http.Handler {
        return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            ctx, cancel := context.WithTimeout(r.Context(), timeout)
            defer cancel()

            // Channel to capture if handler finished
            done := make(chan bool, 1)

            go func() {
                next.ServeHTTP(w, r.WithContext(ctx))
                done <- true
            }()

            select {
            case <-done:
                // Request completed successfully
                return

            case <-ctx.Done():
                // Timeout exceeded
                if ctx.Err() == context.DeadlineExceeded {
                    slog.Error("request_timeout",
                        "path", r.URL.Path,
                        "method", r.Method,
                        "timeout", timeout,
                        "user_id", UserFromContext(r.Context()).ID,
                    )

                    httputil.JSON(w, http.StatusGatewayTimeout, map[string]any{
                        "error": map[string]string{
                            "code":    "request_timeout",
                            "message": "Request took too long to process. Please try again.",
                        },
                    })
                }
            }
        })
    }
}

// LongRequestTimeout is a helper for endpoints that need extended timeouts
func LongRequestTimeout(next http.Handler) http.Handler {
    return RequestTimeout(LongRequestTimeout)(next)
}

Usage in routes.go:

// internal/server/routes.go

func (s *Server) routes() http.Handler {
    r := chi.NewRouter()

    // Global middleware
    r.Use(middleware.RequestID)
    r.Use(middleware.Recovery)
    r.Use(middleware.SecurityHeaders)
    r.Use(middleware.Logging)
    r.Use(middleware.AttackBlocker)
    r.Use(middleware.RequestTimeout(middleware.DefaultRequestTimeout))  // ← Add this

    // ... rest of routes ...

    // Expensive operations get longer timeout
    r.Route("/v1/export", func(r chi.Router) {
        r.Use(middleware.LongRequestTimeout)  // 2 minutes instead of 30s
        r.Post("/csv", s.exportHandler.CSV)
    })

    return r
}

Query-Level Timeouts

File: internal/database/querier.go

package database

import (
    "context"
    "time"

    "github.com/jackc/pgx/v5"
)

const (
    // Max time for any single query
    QueryTimeout = 5 * time.Second

    // Max time for long-running queries (reports, analytics)
    LongQueryTimeout = 30 * time.Second
)

// QueryWithTimeout wraps a query with a timeout
func QueryWithTimeout(ctx context.Context, conn Querier, timeout time.Duration, query string, args ...any) (pgx.Rows, error) {
    ctx, cancel := context.WithTimeout(ctx, timeout)
    defer cancel()

    return conn.Query(ctx, query, args...)
}

// QueryRowWithTimeout wraps QueryRow with a timeout
func QueryRowWithTimeout(ctx context.Context, conn Querier, timeout time.Duration, query string, args ...any) pgx.Row {
    ctx, cancel := context.WithTimeout(ctx, timeout)
    defer cancel()

    return conn.QueryRow(ctx, query, args...)
}

// Example usage in repository:
// rows, err := database.QueryWithTimeout(ctx, conn, database.QueryTimeout,
//     "SELECT * FROM appointments WHERE organization_id = $1", orgID)

3. Query Performance Tracing

Why This Matters

Identify slow queries BEFORE they become problems. Every query > 500ms should be investigated.

Implementation

File: internal/middleware/query_tracer.go

package middleware

import (
    "context"
    "log/slog"
    "net/http"
    "time"

    "github.com/jackc/pgx/v5"
    "restartix-api/internal/database"
)

// QueryPerformanceTracer logs slow queries for investigation
func QueryPerformanceTracer(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        conn := database.TxFromContext(r.Context())
        if conn != nil {
            requestID := RequestIDFromContext(r.Context())
            tracer := &queryTracer{
                requestID: requestID,
                path:      r.URL.Path,
                method:    r.Method,
            }

            // Attach tracer to this connection
            oldTracer := conn.Config().ConnConfig.Tracer
            conn.Config().ConnConfig.Tracer = tracer
            defer func() {
                conn.Config().ConnConfig.Tracer = oldTracer
            }()
        }

        next.ServeHTTP(w, r)
    })
}

type queryTracer struct {
    requestID string
    path      string
    method    string
}

func (qt *queryTracer) TraceQueryStart(ctx context.Context, conn *pgx.Conn, data pgx.TraceQueryStartData) context.Context {
    return context.WithValue(ctx, "query_start", time.Now())
}

func (qt *queryTracer) TraceQueryEnd(ctx context.Context, conn *pgx.Conn, data pgx.TraceQueryEndData) {
    start, ok := ctx.Value("query_start").(time.Time)
    if !ok {
        return
    }

    duration := time.Since(start)

    // Log slow queries
    if duration > 500*time.Millisecond {
        // Truncate long queries for logging
        sqlPreview := data.SQL
        if len(sqlPreview) > 200 {
            sqlPreview = sqlPreview[:200] + "..."
        }

        slog.Warn("slow_query",
            "request_id", qt.requestID,
            "path", qt.path,
            "method", qt.method,
            "duration_ms", duration.Milliseconds(),
            "sql", sqlPreview,
            "rows_affected", data.CommandTag.RowsAffected(),
        )

        // If query took > 5s, this is critical
        if duration > 5*time.Second {
            slog.Error("critical_slow_query",
                "request_id", qt.requestID,
                "duration_ms", duration.Milliseconds(),
                "action", "Investigate immediately - blocking connection for too long",
            )
        }
    }
}

Add to routes:

// internal/server/routes.go

func (s *Server) routes() http.Handler {
    r := chi.NewRouter()

    r.Use(middleware.RequestID)
    r.Use(middleware.Recovery)
    r.Use(middleware.SecurityHeaders)
    r.Use(middleware.Logging)
    r.Use(middleware.AttackBlocker)
    r.Use(middleware.RequestTimeout(middleware.DefaultRequestTimeout))
    r.Use(middleware.QueryPerformanceTracer)  // ← Add this

    // ... routes ...
}

4. Health Check with Pool Metrics

Why This Matters

App Runner health checks need to know if the connection pool is healthy, not just if the HTTP server responds.

Implementation

File: internal/health/handler.go

package health

import (
    "context"
    "encoding/json"
    "log/slog"
    "net/http"
    "time"

    "github.com/jackc/pgx/v5/pgxpool"
    "github.com/redis/go-redis/v9"
)

type Handler struct {
    db    *pgxpool.Pool
    redis *redis.Client
}

func NewHandler(db *pgxpool.Pool, redis *redis.Client) *Handler {
    return &Handler{db: db, redis: redis}
}

type HealthResponse struct {
    Status   string                 `json:"status"`  // "healthy", "degraded", "unhealthy"
    Checks   map[string]HealthCheck `json:"checks"`
    Uptime   int64                  `json:"uptime_seconds"`
}

type HealthCheck struct {
    Status  string                 `json:"status"`
    Message string                 `json:"message,omitempty"`
    Metrics map[string]interface{} `json:"metrics,omitempty"`
}

var startTime = time.Now()

func (h *Handler) Check(w http.ResponseWriter, r *http.Request) {
    ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
    defer cancel()

    response := HealthResponse{
        Status: "healthy",
        Checks: make(map[string]HealthCheck),
        Uptime: int64(time.Since(startTime).Seconds()),
    }

    // Check PostgreSQL
    dbCheck := h.checkPostgreSQL(ctx)
    response.Checks["postgresql"] = dbCheck
    if dbCheck.Status != "healthy" {
        response.Status = "degraded"
    }

    // Check Redis
    redisCheck := h.checkRedis(ctx)
    response.Checks["redis"] = redisCheck
    if redisCheck.Status != "healthy" {
        response.Status = "degraded"
    }

    // Overall status code
    statusCode := http.StatusOK
    if response.Status == "degraded" {
        statusCode = http.StatusServiceUnavailable
    }

    w.Header().Set("Content-Type", "application/json")
    w.WriteHeader(statusCode)
    json.NewEncoder(w).Encode(response)
}

func (h *Handler) checkPostgreSQL(ctx context.Context) HealthCheck {
    // 1. Test connectivity
    if err := h.db.Ping(ctx); err != nil {
        slog.Error("health_check_db_ping_failed", "error", err)
        return HealthCheck{
            Status:  "unhealthy",
            Message: "Database ping failed",
        }
    }

    // 2. Check pool metrics
    stats := h.db.Stat()
    totalConns := stats.TotalConns()
    maxConns := stats.MaxConns()
    utilizationPct := float64(totalConns) / float64(maxConns) * 100

    metrics := map[string]interface{}{
        "total_conns":    totalConns,
        "acquired_conns": stats.AcquiredConns(),
        "idle_conns":     stats.IdleConns(),
        "max_conns":      maxConns,
        "utilization_pct": utilizationPct,
    }

    // 3. Determine health based on utilization
    if utilizationPct > 90 {
        return HealthCheck{
            Status:  "unhealthy",
            Message: "Connection pool near exhaustion",
            Metrics: metrics,
        }
    }

    if utilizationPct > 80 {
        return HealthCheck{
            Status:  "degraded",
            Message: "Connection pool utilization high",
            Metrics: metrics,
        }
    }

    return HealthCheck{
        Status:  "healthy",
        Metrics: metrics,
    }
}

func (h *Handler) checkRedis(ctx context.Context) HealthCheck {
    if err := h.redis.Ping(ctx).Err(); err != nil {
        slog.Error("health_check_redis_ping_failed", "error", err)
        return HealthCheck{
            Status:  "unhealthy",
            Message: "Redis ping failed",
        }
    }

    // Get Redis memory usage
    info, err := h.redis.Info(ctx, "memory").Result()
    if err != nil {
        return HealthCheck{
            Status: "healthy",  // Degraded but not critical
        }
    }

    return HealthCheck{
        Status: "healthy",
        Metrics: map[string]interface{}{
            "info": info,
        },
    }
}

App Runner Health Check Configuration:

yaml

# App Runner service configuration
HealthCheckConfiguration:
  Path: /health
  Interval: 10
  Timeout: 5
  HealthyThreshold: 1
  UnhealthyThreshold: 3

5.1 Key Rotation Automation

File: docs/reference/key-rotation.md

markdown

# Encryption Key Rotation Procedure

## Schedule

**Frequency:** Quarterly (every 90 days)
**Calendar:** Set recurring reminders on:
- February 1 (Q1)
- May 1 (Q2)
- August 1 (Q3)
- November 1 (Q4)

## Automated Reminder System

### Option A: GitHub Actions (Recommended)

**.github/workflows/key-rotation-reminder.yml**

\`\`\`yaml
name: Key Rotation Reminder

on:
  schedule:
    # First day of every quarter at 9am UTC
    - cron: '0 9 1 2,5,8,11 *'

jobs:
  reminder:
    runs-on: ubuntu-latest
    steps:
      - name: Send Slack Notification
        uses: slackapi/[email protected]
        with:
          payload: |
            {
              "text": "🔐 QUARTERLY KEY ROTATION DUE",
              "blocks": [
                {
                  "type": "header",
                  "text": {
                    "type": "plain_text",
                    "text": "🔐 Encryption Key Rotation Required"
                  }
                },
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "*Action Required:* Rotate encryption keys according to HIPAA compliance schedule.\n\n*Procedure:* See docs/reference/key-rotation.md"
                  }
                },
                {
                  "type": "section",
                  "fields": [
                    {
                      "type": "mrkdwn",
                      "text": "*Deadline:* Within 7 days"
                    },
                    {
                      "type": "mrkdwn",
                      "text": "*Assignee:* @security-team"
                    }
                  ]
                }
              ]
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_SECURITY_WEBHOOK }}
\`\`\`

### Option B: Calendar Automation

**Google Calendar / Outlook:**
1. Create recurring event "Encryption Key Rotation" on first day of Q1/Q2/Q3/Q4
2. Set reminders:
   - 7 days before
   - 3 days before
   - Day of
3. Invite: security team, DevOps lead, CTO

## Step-by-Step Rotation

### Pre-Rotation Checklist

- [ ] Verify backup systems are functioning
- [ ] Identify all tables with encrypted columns (column-level encryption is reserved for `pii_regulated` and `auth_secret` per [decisions.md → Why most PII is plaintext](/architecture/decisions#why-most-pii-is-plaintext-and-what-isn-t)):
  - `organization_billing.tax_id_encrypted` (CUI — `pii_regulated`)
  - `organization_integrations.api_key_encrypted` (planned — `auth_secret`)
  - Future webhook signing secrets and integration tokens (`auth_secret`)
- [ ] Estimate re-encryption time (500 rows/sec baseline)
- [ ] Schedule maintenance window if needed (typically not required)

### Rotation Steps

**1. Generate New Key**

\`\`\`bash
# Generate 32-byte (256-bit) key
openssl rand -hex 32 > new-encryption-key.txt

# Securely store in password manager
# DO NOT commit to git
\`\`\`

**2. Add New Key to Config**

\`\`\`bash
# AWS Secrets Manager (via CLI or console)
aws secretsmanager update-secret --secret-id restartix/prod --secret-string '{"ENCRYPTION_KEY_V2":"'$(cat new-encryption-key.txt)'","ENCRYPTION_CURRENT_VERSION":"2"}'

# Keep old key for decryption
# ENCRYPTION_KEY_V1 remains unchanged in the secret
\`\`\`

**3. Deploy with Multi-Version Support**

\`\`\`bash
# Deploy Core API with both keys available (push to main triggers GitHub Actions → ECR → App Runner auto-deploy)
git push origin main

# Verify deployment
curl https://api.restartix.com/health
\`\`\```

**4. Run Re-Encryption Job**

\`\`\`bash
# Run as a one-off task (e.g., via ECS RunTask or locally with production credentials)
go run cmd/tools/reencrypt/main.go

# Monitor progress
tail -f reencrypt.log
\`\`\`

**5. Verify Re-Encryption**

\`\`\`sql
-- Check organization_billing tax IDs are using the active key version
SELECT COUNT(*) AS old_key_count
FROM organization_billing
WHERE tax_id_encrypted IS NOT NULL
  AND get_byte(tax_id_encrypted, 0) != 2;  -- Key version 2

-- Should return 0

-- Check organization_integrations table (when it ships)
SELECT COUNT(*) AS old_key_count
FROM organization_integrations
WHERE api_key_encrypted IS NOT NULL
  AND get_byte(api_key_encrypted, 0) != 2;

-- Should return 0
\`\`\`

**6. Remove Old Key**

\`\`\`bash
# Wait 24 hours to ensure no issues
# Then remove old key from AWS Secrets Manager
aws secretsmanager update-secret --secret-id restartix/prod --secret-string '{"ENCRYPTION_KEY_V1": null}'

# Redeploy (push to main triggers GitHub Actions → ECR → App Runner auto-deploy)
git push origin main
\`\`\`

**7. Audit Log Entry**

\`\`\`sql
INSERT INTO audit_log (
    organization_id, user_id, action, entity_type, entity_id,
    changes, created_at, action_context
) VALUES (
    NULL,  -- System-level action
    (SELECT user_id FROM platform_roles WHERE role = 'superadmin' LIMIT 1),
    'KEY_ROTATION',
    'encryption_key',
    2,  -- New key version
    '{"old_version": 1, "new_version": 2, "rotated_at": "2026-02-13T10:00:00Z"}'::jsonb,
    NOW(),
    'compliance_maintenance'
);
\`\`\`

### Post-Rotation

- [ ] Verify all encrypted data readable
- [ ] Update compliance documentation
- [ ] Update BAA documentation (if key rotation affects it)
- [ ] Securely destroy old key (after 30 days)
- [ ] Schedule next rotation (90 days from now)

## Emergency Key Rotation

**Trigger:** Suspected key compromise

**Procedure:**
1. Follow rotation steps above IMMEDIATELY
2. Notify compliance officer
3. Document incident in `break_glass_log` table
4. Review all audit logs for unauthorized access
5. File security incident report

5.2 De-Identification Enforcement

File: internal/jobs/validate_test_data.go

package jobs

import (
    "context"
    "fmt"
    "log/slog"
    "regexp"

    "github.com/jackc/pgx/v5/pgxpool"
)

// ValidateTestData ensures no production PHI exists in non-prod environments
// Run this in CI/CD before deploying to staging
func ValidateTestData(ctx context.Context, db *pgxpool.Pool, env string) error {
    if env == "production" {
        return nil  // Skip in production
    }

    slog.Info("validating_test_data", "env", env)

    violations := make([]string, 0)

    // 1. Check for real email domains
    realEmails, err := db.Query(ctx, `
        SELECT email FROM users
        WHERE email NOT LIKE '%@example.com'
          AND email NOT LIKE '%@test.com'
          AND email NOT LIKE '%@restartix.test'
        LIMIT 10
    `)
    if err != nil {
        return fmt.Errorf("check emails: %w", err)
    }
    defer realEmails.Close()

    for realEmails.Next() {
        var email string
        realEmails.Scan(&email)
        violations = append(violations, fmt.Sprintf("Real email found: %s", email))
    }

    // 2. Check for real phone numbers (not in test ranges)
    // Romanian test range: +40700000XXX
    realPhones, err := db.Query(ctx, `
        SELECT name FROM patient_profiles
        WHERE phone IS NOT NULL
          AND phone NOT LIKE '+40700000%'
        LIMIT 5  -- Just check a sample
    `)
    if err != nil {
        return fmt.Errorf("check phones: %w", err)
    }
    defer realPhones.Close()

    // Note: We can't decrypt in this check without the encryption key
    // but we can verify the test data generation script was run
    var patientCount int
    db.QueryRow(ctx, "SELECT COUNT(*) FROM patients").Scan(&patientCount)
    if patientCount == 0 {
        violations = append(violations, "No test patients found - database may contain production data")
    }

    // 3. Check organization names
    realOrgs, err := db.Query(ctx, `
        SELECT name FROM organizations
        WHERE name NOT LIKE '%Test%'
          AND name NOT LIKE '%Demo%'
          AND name NOT LIKE '%Example%'
        LIMIT 10
    `)
    if err != nil {
        return fmt.Errorf("check orgs: %w", err)
    }
    defer realOrgs.Close()

    for realOrgs.Next() {
        var name string
        realOrgs.Scan(&name)
        // Allow "Acme Clinic" style test names
        if !regexp.MustCompile(`(?i)(acme|widgets|corp|inc test|sample)`).MatchString(name) {
            violations = append(violations, fmt.Sprintf("Suspicious org name: %s", name))
        }
    }

    // 4. Fail if violations found
    if len(violations) > 0 {
        slog.Error("test_data_validation_failed",
            "env", env,
            "violations", violations,
        )
        return fmt.Errorf("test data validation failed: %d violations", len(violations))
    }

    slog.Info("test_data_validation_passed", "env", env)
    return nil
}

CI/CD Integration:

yaml

# .github/workflows/deploy-staging.yml

name: Deploy to Staging

on:
  push:
    branches: [develop]

jobs:
  validate-test-data:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Setup Go
        uses: actions/setup-go@v4
        with:
          go-version: '1.23'

      - name: Validate Test Data
        env:
          DATABASE_URL: ${{ secrets.STAGING_DATABASE_URL }}
        run: |
          go run cmd/tools/validate-test-data/main.go

      - name: Fail if PHI detected
        if: failure()
        run: |
          echo "❌ PRODUCTION PHI DETECTED IN STAGING DATABASE"
          echo "This is a HIPAA violation. Deployment blocked."
          exit 1

  deploy:
    needs: validate-test-data
    runs-on: ubuntu-latest
    steps:
      # ... deployment steps ...

5.3 Webhook Payload PHI Audit

File: internal/webhook/payload_validator.go

package webhook

import (
    "encoding/json"
    "fmt"
    "log/slog"
    "regexp"
    "strings"
)

// PHIPatterns are regex patterns that might indicate PHI in webhook payloads
var PHIPatterns = []*regexp.Regexp{
    // Email addresses (should not be in webhooks)
    regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`),

    // Phone numbers (various formats)
    regexp.MustCompile(`\+?[0-9]{10,15}`),
    regexp.MustCompile(`\d{3}[-.\s]?\d{3}[-.\s]?\d{4}`),

    // Potential names (capitalized words, 2+ words)
    // This is a heuristic and may have false positives
    regexp.MustCompile(`\b[A-Z][a-z]+ [A-Z][a-z]+\b`),

    // Medical record numbers (patterns like MRN-12345)
    regexp.MustCompile(`(?i)(mrn|patient[_-]?id|record[_-]?number)[:\s]*[0-9]+`),
}

// SensitiveFields are field names that should never appear in webhook payloads
var SensitiveFields = []string{
    "email", "phone", "emergency_contact_phone", "name", "address",
    "ssn", "tax_id", "birth_date", "medical_record_number",
    "diagnosis", "prescription", "treatment",
}

// ValidatePayload checks if a webhook payload contains PHI
// Returns error if PHI detected
func ValidatePayload(payload map[string]interface{}) error {
    violations := make([]string, 0)

    // Convert to JSON string for pattern matching
    jsonBytes, _ := json.Marshal(payload)
    jsonStr := string(jsonBytes)

    // Check for PHI patterns
    for _, pattern := range PHIPatterns {
        if matches := pattern.FindAllString(jsonStr, -1); len(matches) > 0 {
            violations = append(violations, fmt.Sprintf("Pattern match: %v", matches))
        }
    }

    // Check for sensitive field names
    violations = append(violations, checkSensitiveFields(payload, "")...)

    if len(violations) > 0 {
        slog.Error("webhook_phi_violation",
            "violations", violations,
            "payload_preview", jsonStr[:min(200, len(jsonStr))],
        )
        return fmt.Errorf("PHI detected in webhook payload: %d violations", len(violations))
    }

    return nil
}

// checkSensitiveFields recursively checks for sensitive field names
func checkSensitiveFields(data interface{}, path string) []string {
    violations := make([]string, 0)

    switch v := data.(type) {
    case map[string]interface{}:
        for key, value := range v {
            fieldPath := key
            if path != "" {
                fieldPath = path + "." + key
            }

            // Check if field name is sensitive
            for _, sensitive := range SensitiveFields {
                if strings.Contains(strings.ToLower(key), sensitive) {
                    violations = append(violations, fmt.Sprintf("Sensitive field: %s", fieldPath))
                }
            }

            // Recurse into nested objects
            violations = append(violations, checkSensitiveFields(value, fieldPath)...)
        }

    case []interface{}:
        for i, item := range v {
            violations = append(violations, checkSensitiveFields(item, fmt.Sprintf("%s[%d]", path, i))...)
        }
    }

    return violations
}

// ApprovedWebhookPayload is a safe payload structure (whitelist approach)
type ApprovedWebhookPayload struct {
    Event         string                 `json:"event"`          // e.g., "appointment.created"
    EntityType    string                 `json:"entity_type"`    // e.g., "appointment"
    EntityID      int64                  `json:"entity_id"`      // Database ID only
    EntityUID     string                 `json:"entity_uid"`     // UUID only
    OrganizationID int64                 `json:"organization_id"`
    Timestamp     string                 `json:"timestamp"`      // ISO 8601
    Metadata      map[string]interface{} `json:"metadata,omitempty"` // Safe metadata only
}

// ToApprovedPayload converts an internal event to a safe webhook payload
func ToApprovedPayload(event *Event) (*ApprovedWebhookPayload, error) {
    payload := &ApprovedWebhookPayload{
        Event:          event.Type,
        EntityType:     event.EntityType,
        EntityID:       event.EntityID,
        EntityUID:      event.EntityUID,
        OrganizationID: event.OrganizationID,
        Timestamp:      event.Timestamp.Format(time.RFC3339),
    }

    // Only include safe metadata (IDs, statuses, timestamps - NO PHI)
    safeMetadata := make(map[string]interface{})
    if event.EntityType == "appointment" {
        safeMetadata["status"] = event.Metadata["status"]
        safeMetadata["started_at"] = event.Metadata["started_at"]
    }
    payload.Metadata = safeMetadata

    // Validate before sending
    payloadMap := make(map[string]interface{})
    bytes, _ := json.Marshal(payload)
    json.Unmarshal(bytes, &payloadMap)

    if err := ValidatePayload(payloadMap); err != nil {
        return nil, err
    }

    return payload, nil
}

Automated Testing:

// internal/webhook/payload_validator_test.go

package webhook

import (
    "testing"

    "github.com/stretchr/testify/assert"
)

func TestValidatePayload_DetectsPHI(t *testing.T) {
    tests := []struct {
        name        string
        payload     map[string]interface{}
        expectError bool
    }{
        {
            name: "safe payload with IDs only",
            payload: map[string]interface{}{
                "event":           "appointment.created",
                "entity_id":       123,
                "entity_uid":      "550e8400-e29b-41d4-a716-446655440000",
                "organization_id": 1,
            },
            expectError: false,
        },
        {
            name: "unsafe payload with email",
            payload: map[string]interface{}{
                "event":      "appointment.created",
                "patient_email": "[email protected]",
            },
            expectError: true,
        },
        {
            name: "unsafe payload with phone",
            payload: map[string]interface{}{
                "event": "appointment.created",
                "phone": "+40700000123",
            },
            expectError: true,
        },
        {
            name: "unsafe payload with name",
            payload: map[string]interface{}{
                "event":        "appointment.created",
                "patient_name": "John Doe",
            },
            expectError: true,
        },
        {
            name: "unsafe payload with nested PHI",
            payload: map[string]interface{}{
                "event": "appointment.created",
                "data": map[string]interface{}{
                    "patient": map[string]interface{}{
                        "email": "[email protected]",
                    },
                },
            },
            expectError: true,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            err := ValidatePayload(tt.payload)
            if tt.expectError {
                assert.Error(t, err, "Expected PHI violation to be detected")
            } else {
                assert.NoError(t, err, "Expected payload to be safe")
            }
        })
    }
}

CI/CD Integration:

yaml

# .github/workflows/test.yml

- name: Run Webhook PHI Audit Tests
  run: go test ./internal/webhook -v -run TestValidatePayload

# Fail deployment if tests don't pass
- name: Check for webhook PHI violations
  if: failure()
  run: |
    echo "❌ Webhook payload validation failed"
    echo "Review webhook payloads for PHI leakage"
    exit 1

6. Documentation Updates

Update 04-auth-and-security.md

Add this section at line 493 (after BAA checklist):

markdown

### Key Rotation Enforcement

**Automated Reminders:** GitHub Actions workflow sends quarterly reminders to security team
**Procedure:** See [Key Rotation Procedure](./key-rotation.md)
**Audit Trail:** All rotations logged in `audit_log` table with `action_context = 'compliance_maintenance'`

**Last rotation:** (tracked in audit_log)
**Next rotation due:** (automated reminder system)

### Test Data Enforcement

**Validation:** Automated CI/CD checks prevent production PHI in staging
**Procedure:** See `internal/jobs/validate_test_data.go`
**Enforcement:** Deployment blocked if real email domains or PHI patterns detected

### Webhook PHI Audit

**Validation:** All webhook payloads validated before sending
**Implementation:** See `internal/webhook/payload_validator.go`
**Testing:** Automated tests ensure no PHI in webhook payloads
**Whitelist Approach:** Only approved fields (IDs, UIDs, timestamps, statuses) allowed

Implementation Checklist

Week 1: Critical Monitoring

[ ] Implement internal/observability/pool_metrics.go
[ ] Implement internal/middleware/query_timeout.go
[ ] Implement internal/middleware/query_tracer.go
[ ] Update internal/health/handler.go with pool metrics
[ ] Add metrics to cmd/api/main.go
[ ] Deploy to staging and verify logging
[ ] Set up Datadog/CloudWatch dashboards

[ ] Create docs/reference/key-rotation.md
[ ] Set up GitHub Actions key rotation reminder
[ ] Implement internal/jobs/validate_test_data.go
[ ] Implement internal/webhook/payload_validator.go
[ ] Add webhook payload tests
[ ] Update CI/CD with validation steps
[ ] Test key rotation procedure in staging

Week 3: Documentation & Training

[ ] Update docs/04-auth-and-security.md with new sections
[ ] Document Datadog alert thresholds
[ ] Create runbook for connection pool issues
[ ] Train team on monitoring dashboards
[ ] Schedule first key rotation (add to calendar)

Success Metrics

Metric	Target	Measurement
Connection pool monitoring	100% uptime	Datadog dashboard
Slow query detection	< 1% queries > 500ms	Query tracer logs
Key rotation compliance	100% on-time	Audit log entries
Test data validation	0 PHI violations	CI/CD pass rate
Webhook PHI audit	0 PHI in payloads	Automated tests

Next Steps

After completing these immediate actions:

Phase 2 Planning: Begin planning read replica migration (when > 10 orgs)
Audit Log Partitioning: Implement monthly partitions (when > 1M audit log rows)
Dedicated-Mode Provisioning: Build the runtime + ops templating for the dedicated tenancy mode (when first paying dedicated-mode clinic contract closes — see features/platform/tenant-isolation.md)

Immediate Actions: Production Readiness ​

1. Connection Pool Monitoring (CRITICAL) ​

Why This Matters ​

Implementation ​

Datadog/CloudWatch Integration ​

2. Query Timeout Middleware (CRITICAL) ​

Why This Matters ​

Implementation ​

Query-Level Timeouts ​

3. Query Performance Tracing ​

Why This Matters ​

Implementation ​

4. Health Check with Pool Metrics ​

Why This Matters ​

Implementation ​

5. HIPAA/GDPR Compliance Fixes ​

5.1 Key Rotation Automation ​

5.2 De-Identification Enforcement ​

5.3 Webhook Payload PHI Audit ​

6. Documentation Updates ​

Update 04-auth-and-security.md ​

Implementation Checklist ​

Week 1: Critical Monitoring ​

Week 2: HIPAA/GDPR Compliance ​

Week 3: Documentation & Training ​

Success Metrics ​

Next Steps ​

Immediate Actions: Production Readiness

1. Connection Pool Monitoring (CRITICAL)

Why This Matters

Implementation

Datadog/CloudWatch Integration

2. Query Timeout Middleware (CRITICAL)

Why This Matters

Implementation

Query-Level Timeouts

3. Query Performance Tracing

Why This Matters

Implementation

4. Health Check with Pool Metrics

Why This Matters

Implementation

5. HIPAA/GDPR Compliance Fixes

5.1 Key Rotation Automation

5.2 De-Identification Enforcement

5.3 Webhook Payload PHI Audit

6. Documentation Updates

Update 04-auth-and-security.md

Implementation Checklist

Week 1: Critical Monitoring

Week 2: HIPAA/GDPR Compliance

Week 3: Documentation & Training

Success Metrics

Next Steps