{
  "schemaVersion": "2021-11-01",
  "name": "Snowflake-AWS-WAF-Lens",
  "description": "A custom lens for the AWS Well-Architected Framework that provides organizations with a structured, repeatable review mechanism to evaluate Snowflake deployments on AWS against proven best practices spanning security, reliability, performance efficiency, cost optimization, operational excellence, and sustainability.",
  "pillars": [
    {
      "id": "operationalExcellence",
      "name": "Operational Excellence",
      "questions": [
        {
          "id": "SFOPS01",
          "title": "How do you monitor Snowflake resource consumption and credit usage?",
          "description": "Resource monitors in Snowflake allow you to set credit quotas and alerts at the account and warehouse level. Proper configuration ensures visibility into compute spend and prevents unexpected cost overruns.",
          "choices": [
            {
              "id": "SFOPS01_BP01",
              "title": "Configure account-level and warehouse-level resource monitors",
              "helpfulResource": {
                "displayText": "Snowflake resource monitors enable you to set credit quotas at the account or individual warehouse level. Configure monitors with multiple threshold percentages (e.g., 75%, 90%, 100%) and define actions such as notify, notify and suspend, or notify and suspend immediately to control credit consumption proactively.",
                "url": "https://docs.snowflake.com/en/user-guide/resource-monitors"
              },
              "improvementPlan": {
                "displayText": "Create resource monitors at both the account level and for each production virtual warehouse. Set notification thresholds at 75% and 90% of quota. Configure suspend actions at 100% for non-critical warehouses. Review and adjust quotas monthly based on usage trends from the ACCOUNT_USAGE.WAREHOUSE_METERING_HISTORY view.",
                "url": "https://docs.snowflake.com/en/user-guide/resource-monitors"
              }
            },
            {
              "id": "SFOPS01_BP02",
              "title": "Set up alerting and notification channels for resource monitor triggers",
              "helpfulResource": {
                "displayText": "Integrate Snowflake resource monitor alerts with your organization's notification infrastructure (e.g., email, Amazon SNS, or PagerDuty) to ensure the right teams are notified when credit consumption approaches defined thresholds.",
                "url": "https://docs.snowflake.com/en/user-guide/resource-monitors"
              },
              "improvementPlan": {
                "displayText": "Configure email notifications for all resource monitor thresholds. Integrate alerts with your centralized monitoring platform using Snowflake's notification integrations. Establish an escalation path for critical threshold breaches. Test alert delivery regularly to confirm notifications reach the appropriate on-call teams.",
                "url": "https://docs.snowflake.com/en/user-guide/resource-monitors"
              }
            },
            {
              "id": "SFOPS01_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFOPS01_BP01 && SFOPS01_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFOPS01_BP01 || SFOPS01_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFOPS02",
          "title": "How do you leverage query history and audit logging for operational visibility?",
          "description": "Snowflake's ACCOUNT_USAGE schema provides rich historical data on queries, logins, warehouse usage, and data access patterns. Leveraging these views is essential for operational troubleshooting, performance tuning, and compliance auditing.",
          "choices": [
            {
              "id": "SFOPS02_BP01",
              "title": "Enable and regularly review ACCOUNT_USAGE query history and access history views",
              "helpfulResource": {
                "displayText": "The ACCOUNT_USAGE schema includes views such as QUERY_HISTORY, LOGIN_HISTORY, ACCESS_HISTORY, and WAREHOUSE_METERING_HISTORY. These views provide up to 365 days of historical data and are essential for understanding workload patterns, identifying anomalies, and supporting audit requirements.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage"
              },
              "improvementPlan": {
                "displayText": "Grant the SNOWFLAKE database IMPORTED PRIVILEGES role to your monitoring and audit teams. Create scheduled queries or dashboards that surface key metrics from QUERY_HISTORY (e.g., long-running queries, failed queries) and LOGIN_HISTORY (e.g., failed login attempts). Retain query results for compliance by exporting to an S3-based data lake if retention beyond 365 days is required.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage"
              }
            },
            {
              "id": "SFOPS02_BP02",
              "title": "Implement centralized logging by exporting Snowflake audit data to AWS services",
              "helpfulResource": {
                "displayText": "Export Snowflake audit and usage data to Amazon S3 and integrate with AWS services such as Amazon CloudWatch, AWS CloudTrail Lake, or Amazon OpenSearch for centralized log analysis, correlation with AWS-side events, and long-term retention.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage"
              },
              "improvementPlan": {
                "displayText": "Create Snowflake tasks that periodically export ACCOUNT_USAGE data to an external S3 stage. Configure AWS Glue or Athena to catalog and query the exported data. Build unified dashboards in Amazon QuickSight or your preferred BI tool that correlate Snowflake operational metrics with AWS infrastructure metrics.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage"
              }
            },
            {
              "id": "SFOPS02_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFOPS02_BP01 && SFOPS02_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFOPS02_BP01 || SFOPS02_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFOPS03",
          "title": "How do you monitor and manage Snowflake tasks and data pipelines?",
          "description": "Snowflake tasks and Snowpipe enable automated data ingestion and transformation workflows. Monitoring these components ensures data freshness, pipeline reliability, and timely detection of failures.",
          "choices": [
            {
              "id": "SFOPS03_BP01",
              "title": "Monitor task execution history and configure failure alerting",
              "helpfulResource": {
                "displayText": "Use the TASK_HISTORY table function and ACCOUNT_USAGE.TASK_HISTORY view to monitor task execution status, duration, and error messages. Configure alerts for task failures to enable rapid remediation and prevent data pipeline disruptions.",
                "url": "https://docs.snowflake.com/en/user-guide/tasks-intro"
              },
              "improvementPlan": {
                "displayText": "Query TASK_HISTORY regularly to identify failed or long-running tasks. Set up Snowflake alerts (using the CREATE ALERT command) that trigger on task failure conditions. Integrate alert notifications with your incident management system. Establish SLAs for task completion times and monitor adherence.",
                "url": "https://docs.snowflake.com/en/user-guide/tasks-intro"
              }
            },
            {
              "id": "SFOPS03_BP02",
              "title": "Implement end-to-end pipeline observability including Snowpipe monitoring",
              "helpfulResource": {
                "displayText": "Monitor Snowpipe ingestion using the PIPE_USAGE_HISTORY and COPY_HISTORY views. Combine with AWS-side monitoring of S3 event notifications and SQS queues to achieve full pipeline observability from source to Snowflake table.",
                "url": "https://docs.snowflake.com/en/user-guide/data-load-snowpipe-intro"
              },
              "improvementPlan": {
                "displayText": "Create dashboards that track Snowpipe file ingestion latency, error rates, and throughput. Monitor the S3 event notification and SQS queue health on the AWS side. Implement data quality checks downstream of ingestion to validate row counts and schema conformance. Set up automated retry logic for transient ingestion failures.",
                "url": "https://docs.snowflake.com/en/user-guide/data-load-snowpipe-intro"
              }
            },
            {
              "id": "SFOPS03_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFOPS03_BP01 && SFOPS03_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFOPS03_BP01 || SFOPS03_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFOPS04",
          "title": "How do you apply CI/CD practices to Snowflake schema and object management?",
          "description": "Without version control and CI/CD, Snowflake schemas drift over time  -  ad-hoc DDL changes accumulate, environments diverge, and deployments become risky manual processes. Infrastructure-as-Code for Snowflake ensures reproducibility, auditability, and safe promotion across environments.",
          "choices": [
            {
              "id": "SFOPS04_BP01",
              "title": "Use DCM, dbt, or Terraform to manage Snowflake objects in version-controlled source code",
              "helpfulResource": {
                "displayText": "Store all Snowflake DDL (tables, views, warehouses, roles, grants) in Git using DCM (Snowflake native), dbt (for transform layer), or the Terraform Snowflake provider. Apply changes via a CI/CD pipeline with dev → staging → prod promotion gates and automated validation at each stage.",
                "url": "https://docs.snowflake.com/en/developer-guide/native-apps/dcm/about-dcm"
              },
              "improvementPlan": {
                "displayText": "Structure your repo: dcm/databases/, dcm/roles/, dcm/warehouses/. Run `dcm diff` in CI to preview changes. Require PR approval for any production schema change. Use `CREATE DATABASE dev_db CLONE prod_db` to spin up zero-cost dev environments for testing schema changes before promotion.",
                "url": "https://docs.snowflake.com/en/developer-guide/native-apps/dcm/about-dcm"
              }
            },
            {
              "id": "SFOPS04_BP02",
              "title": "Use zero-copy clone environments for integration testing before deploying schema changes to production",
              "helpfulResource": {
                "displayText": "Zero-copy clones of production databases provide full-fidelity test environments at near-zero cost. Run integration tests against a clone before applying schema changes to production. This eliminates the risk of untested DDL breaking production pipelines.",
                "url": "https://docs.snowflake.com/en/user-guide/object-clone"
              },
              "improvementPlan": {
                "displayText": "In CI pipeline: (1) CREATE DATABASE test_env CLONE prod_db; (2) run migration scripts against test_env; (3) execute integration tests; (4) if pass, promote to prod; (5) DROP DATABASE test_env. Automate with GitHub Actions or AWS CodePipeline calling Snowflake CLI (snow sql).",
                "url": "https://docs.snowflake.com/en/developer-guide/snowflake-cli/index"
              }
            },
            {
              "id": "SFOPS04_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFOPS04_BP01 && SFOPS04_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFOPS04_BP01 || SFOPS04_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFOPS05",
          "title": "How do you configure automated alerting for Snowflake pipeline and workload failures?",
          "description": "Silent pipeline failures  -  failed tasks, stale streams, pipe errors  -  go unnoticed without proactive alerting. Snowflake ALERT objects and notification integrations allow condition-based automated alerts that integrate with existing incident management systems.",
          "choices": [
            {
              "id": "SFOPS05_BP01",
              "title": "Create Snowflake ALERT objects for critical pipeline failure conditions",
              "helpfulResource": {
                "displayText": "Snowflake ALERT objects run on a schedule, evaluate a SQL condition, and trigger an action (email, notification integration) when the condition returns rows. Use them to monitor TASK_HISTORY for failures, PIPE_USAGE_HISTORY for ingest gaps, and QUERY_HISTORY for error spikes.",
                "url": "https://docs.snowflake.com/en/user-guide/alerts"
              },
              "improvementPlan": {
                "displayText": "Example: CREATE ALERT pipeline_failure WAREHOUSE=monitor_wh SCHEDULE='5 MINUTE' IF(EXISTS(SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.TASK_HISTORY WHERE STATE='FAILED' AND SCHEDULED_TIME > DATEADD('MINUTE',-5,CURRENT_TIMESTAMP()))) THEN CALL SYSTEM$SEND_EMAIL('ops@company.com','Pipeline failure detected','Check TASK_HISTORY'); ALTER ALERT pipeline_failure RESUME;",
                "url": "https://docs.snowflake.com/en/user-guide/alerts"
              }
            },
            {
              "id": "SFOPS05_BP02",
              "title": "Route Snowflake alerts to AWS SNS or external incident management via notification integrations",
              "helpfulResource": {
                "displayText": "Snowflake notification integrations connect ALERT triggers to AWS SNS topics, enabling routing to PagerDuty, OpsGenie, Slack, or any SNS subscriber. This unifies Snowflake pipeline alerts with your existing AWS-side CloudWatch alarm notification infrastructure.",
                "url": "https://docs.snowflake.com/en/user-guide/notifications/notification-integrations"
              },
              "improvementPlan": {
                "displayText": "CREATE NOTIFICATION INTEGRATION sf_to_sns TYPE=QUEUE NOTIFICATION_PROVIDER=AWS_SNS ENABLED=TRUE AWS_SNS_TOPIC_ARN='arn:aws:sns:us-west-2:<acct>:ops-alerts' AWS_SNS_ROLE_ARN='<iam_role_arn>'; Reference in alert action. Validate by manually triggering a test failure and confirming the SNS message is received.",
                "url": "https://docs.snowflake.com/en/user-guide/notifications/notification-integrations"
              }
            },
            {
              "id": "SFOPS05_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFOPS05_BP01 && SFOPS05_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFOPS05_BP01 || SFOPS05_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFOPS06",
          "title": "How do you establish a continuous improvement cadence for Snowflake workload efficiency?",
          "description": "Without a regular review process, slow queries accumulate, underutilized warehouses persist, and cost/performance drift goes undetected. A cadenced review of ACCOUNT_USAGE metrics drives ongoing optimization.",
          "choices": [
            {
              "id": "SFOPS06_BP01",
              "title": "Schedule regular QUERY_HISTORY reviews to identify and optimize expensive or slow queries",
              "helpfulResource": {
                "displayText": "Run weekly or bi-weekly reviews of ACCOUNT_USAGE.QUERY_HISTORY filtered to the top queries by TOTAL_ELAPSED_TIME, BYTES_SCANNED, or CREDITS_USED_CLOUD_SERVICES. Classify findings into: add clustering key, rewrite query, add Search Optimization, or cache result. Track resolution in your team backlog.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/query_history"
              },
              "improvementPlan": {
                "displayText": "Weekly query: SELECT query_text, total_elapsed_time/1000 AS elapsed_sec, bytes_scanned/1e9 AS gb_scanned, credits_used_cloud_services FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY WHERE start_time >= DATEADD('day',-7,CURRENT_TIMESTAMP()) AND execution_status='SUCCESS' ORDER BY total_elapsed_time DESC LIMIT 20; Assign optimization tasks from top results.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/query_history"
              }
            },
            {
              "id": "SFOPS06_BP02",
              "title": "Use Snowflake's built-in Query Profile and Streamlit dashboards to surface warehouse and pipeline inefficiencies",
              "helpfulResource": {
                "displayText": "Query Profile (in Snowsight) visualizes per-operator execution times, identifying spills, full scans, and join explosions. Build a Streamlit in Snowflake dashboard over WAREHOUSE_LOAD_HISTORY and QUERY_HISTORY for continuous ops visibility without exporting data.",
                "url": "https://docs.snowflake.com/en/user-guide/ui-query-profile"
              },
              "improvementPlan": {
                "displayText": "Build a Streamlit ops dashboard: metrics = [avg queue time by warehouse, top 10 slow queries by day, credit burn rate vs budget, task failure rate, pipe ingest lag]. Schedule a 30-min monthly review with the team to triage findings. Create Jira/GitHub issues for any item taking > 2x expected time.",
                "url": "https://docs.snowflake.com/en/developer-guide/streamlit/about-streamlit"
              }
            },
            {
              "id": "SFOPS06_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFOPS06_BP01 && SFOPS06_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFOPS06_BP01 || SFOPS06_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        }
      ]
    },
    {
      "id": "security",
      "name": "Security",
      "questions": [
        {
          "id": "SFSEC01",
          "title": "How do you secure network connectivity to your Snowflake account?",
          "description": "Network security for Snowflake on AWS involves configuring network policies to restrict access by IP address and establishing private connectivity via AWS PrivateLink to eliminate data traversal over the public internet.",
          "choices": [
            {
              "id": "SFSEC01_BP01",
              "title": "Configure Snowflake network policies to restrict access by IP allowlist",
              "helpfulResource": {
                "displayText": "Snowflake network policies allow you to define allowed and blocked IP address ranges at the account or user level. This ensures only traffic from known corporate networks, VPNs, or AWS VPC CIDR ranges can reach your Snowflake account.",
                "url": "https://docs.snowflake.com/en/user-guide/network-policies"
              },
              "improvementPlan": {
                "displayText": "Create a network policy that allowlists your corporate IP ranges and AWS VPC NAT gateway IPs. Apply the policy at the account level for broad enforcement. Test connectivity from allowed and blocked IPs before enforcing. Review and update the allowlist quarterly or when network infrastructure changes.",
                "url": "https://docs.snowflake.com/en/user-guide/network-policies"
              }
            },
            {
              "id": "SFSEC01_BP02",
              "title": "Establish AWS PrivateLink connectivity to Snowflake",
              "helpfulResource": {
                "displayText": "AWS PrivateLink for Snowflake creates a private, dedicated connection between your AWS VPC and Snowflake, ensuring that traffic never traverses the public internet. This reduces exposure to network-based threats and satisfies compliance requirements for private connectivity.",
                "url": "https://docs.snowflake.com/en/user-guide/admin-security-privatelink"
              },
              "improvementPlan": {
                "displayText": "Provision a VPC endpoint for Snowflake in each AWS region where your workloads operate. Update DNS resolution to route Snowflake traffic through the PrivateLink endpoint. Configure security groups on the VPC endpoint to restrict access to authorized subnets. Validate that all application and ETL connections use the PrivateLink URL.",
                "url": "https://docs.snowflake.com/en/user-guide/admin-security-privatelink"
              }
            },
            {
              "id": "SFSEC01_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSEC01_BP01 && SFSEC01_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFSEC01_BP01 || SFSEC01_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFSEC02",
          "title": "How do you implement role-based access control and privilege minimization in Snowflake?",
          "description": "Snowflake's Role-Based Access Control (RBAC) model governs access to all securable objects. Implementing least-privilege access through a well-designed role hierarchy is critical to preventing unauthorized data access and limiting blast radius of compromised credentials.",
          "choices": [
            {
              "id": "SFSEC02_BP01",
              "title": "Design and enforce a role hierarchy following least-privilege principles",
              "helpfulResource": {
                "displayText": "Create a structured role hierarchy that separates functional roles (e.g., data_reader, data_writer, data_admin) from system roles (SYSADMIN, SECURITYADMIN, ACCOUNTADMIN). Grant privileges to functional roles and assign users to the most restrictive role that meets their needs.",
                "url": "https://docs.snowflake.com/en/user-guide/security-access-control-overview"
              },
              "improvementPlan": {
                "displayText": "Audit existing role grants using the GRANTS_TO_ROLES and GRANTS_TO_USERS views. Identify and remediate over-privileged users. Implement a naming convention for roles that reflects their purpose and scope. Ensure all custom roles roll up to SYSADMIN. Restrict ACCOUNTADMIN to break-glass scenarios only with MFA enforcement.",
                "url": "https://docs.snowflake.com/en/user-guide/security-access-control-overview"
              }
            },
            {
              "id": "SFSEC02_BP02",
              "title": "Conduct regular access reviews and privilege audits",
              "helpfulResource": {
                "displayText": "Periodically review role assignments and privilege grants to ensure they remain aligned with current job functions. Use ACCOUNT_USAGE views such as GRANTS_TO_USERS and ACCESS_HISTORY to identify dormant accounts, excessive privileges, and unusual access patterns.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage"
              },
              "improvementPlan": {
                "displayText": "Establish a quarterly access review cadence. Generate reports from GRANTS_TO_USERS and GRANTS_TO_ROLES to identify users with privileges beyond their current role requirements. Revoke unused grants and disable dormant accounts. Document the review process and findings for compliance evidence.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage"
              }
            },
            {
              "id": "SFSEC02_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSEC02_BP01 && SFSEC02_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFSEC02_BP01 || SFSEC02_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFSEC03",
          "title": "How do you manage data encryption and key management for Snowflake?",
          "description": "Snowflake encrypts all data at rest and in transit by default. For organizations requiring customer-managed keys, Tri-Secret Secure provides an additional layer of encryption control by combining a Snowflake-managed key with a customer-managed key in AWS KMS.",
          "choices": [
            {
              "id": "SFSEC03_BP01",
              "title": "Validate default encryption and enable Tri-Secret Secure for enhanced key control",
              "helpfulResource": {
                "displayText": "Snowflake automatically encrypts all data at rest using AES-256 and all data in transit using TLS 1.2+. For organizations with regulatory requirements for customer-managed encryption keys, Tri-Secret Secure combines a Snowflake-managed key with a customer-managed key stored in AWS KMS, ensuring that neither party alone can access the data.",
                "url": "https://docs.snowflake.com/en/user-guide/security-encryption"
              },
              "improvementPlan": {
                "displayText": "Confirm that TLS 1.2 or higher is enforced for all client connections. For workloads requiring customer-managed keys, enable Tri-Secret Secure by creating a KMS key in your AWS account and configuring it with Snowflake. Implement key rotation policies in AWS KMS. Monitor key usage via AWS CloudTrail and set alerts for unauthorized key access attempts.",
                "url": "https://docs.snowflake.com/en/user-guide/security-encryption"
              }
            },
            {
              "id": "SFSEC03_BP02",
              "title": "Implement key rotation and lifecycle management",
              "helpfulResource": {
                "displayText": "Snowflake automatically rotates its managed encryption keys. When using Tri-Secret Secure, you must also manage the rotation lifecycle of your customer-managed key in AWS KMS to maintain compliance with your organization's cryptographic policies.",
                "url": "https://docs.snowflake.com/en/user-guide/security-encryption"
              },
              "improvementPlan": {
                "displayText": "Enable automatic annual key rotation in AWS KMS for your Snowflake customer-managed key. Document the key rotation schedule and test the rotation process in a non-production environment. Establish a runbook for emergency key revocation scenarios. Monitor key age and rotation status via AWS Config rules.",
                "url": "https://docs.snowflake.com/en/user-guide/security-encryption"
              }
            },
            {
              "id": "SFSEC03_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSEC03_BP01 && SFSEC03_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFSEC03_BP01 || SFSEC03_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFSEC04",
          "title": "How do you implement data governance controls in Snowflake?",
          "description": "Snowflake provides dynamic data masking, row access policies, and object tagging to enforce fine-grained data governance. These controls ensure sensitive data is protected based on the role and context of the user accessing it.",
          "choices": [
            {
              "id": "SFSEC04_BP01",
              "title": "Implement dynamic data masking policies for sensitive columns",
              "helpfulResource": {
                "displayText": "Dynamic data masking in Snowflake allows you to define masking policies that conditionally mask column values at query time based on the executing role. This ensures that sensitive data such as PII, PHI, or financial data is only visible to authorized roles without duplicating data or creating separate views.",
                "url": "https://docs.snowflake.com/en/user-guide/security-column-ddm-intro"
              },
              "improvementPlan": {
                "displayText": "Identify columns containing sensitive data across your Snowflake databases. Create masking policies that define full visibility for authorized roles and masked output for all others. Apply policies to columns using ALTER TABLE ... ALTER COLUMN ... SET MASKING POLICY. Test masking behavior across all consuming roles. Document the masking policy inventory and review quarterly.",
                "url": "https://docs.snowflake.com/en/user-guide/security-column-ddm-intro"
              }
            },
            {
              "id": "SFSEC04_BP02",
              "title": "Configure row access policies and object tagging for governance",
              "helpfulResource": {
                "displayText": "Row access policies restrict which rows a given role can see in a table, enabling multi-tenant data isolation or regional data residency enforcement. Object tagging allows you to classify Snowflake objects (databases, schemas, tables, columns) with metadata tags for governance tracking and policy enforcement.",
                "url": "https://docs.snowflake.com/en/user-guide/security-row-intro"
              },
              "improvementPlan": {
                "displayText": "Define row access policies for tables containing multi-tenant or regionally sensitive data. Use object tags to classify data sensitivity levels (e.g., PUBLIC, INTERNAL, CONFIDENTIAL, RESTRICTED). Create tag-based masking policies that automatically apply masking based on the sensitivity tag. Monitor tag coverage using the TAG_REFERENCES view to ensure all sensitive objects are classified.",
                "url": "https://docs.snowflake.com/en/user-guide/security-row-intro"
              }
            },
            {
              "id": "SFSEC04_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSEC04_BP01 && SFSEC04_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFSEC04_BP01 || SFSEC04_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFSEC05",
          "title": "How do you manage authentication for Snowflake users and service accounts?",
          "description": "Snowflake supports multiple authentication methods including MFA, SSO via SAML, OAuth, and key pair authentication. A robust authentication strategy reduces the risk of credential compromise and aligns with enterprise identity management standards.",
          "choices": [
            {
              "id": "SFSEC05_BP01",
              "title": "Enforce MFA for interactive users and SSO/SAML integration with your identity provider",
              "helpfulResource": {
                "displayText": "Enable multi-factor authentication (MFA) for all interactive Snowflake users, especially those with elevated privileges. Integrate Snowflake with your enterprise identity provider (e.g., Okta, Azure AD, AWS IAM Identity Center) via SAML 2.0 for single sign-on, centralizing authentication and enabling consistent access policies.",
                "url": "https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-overview"
              },
              "improvementPlan": {
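                "displayText": "Require MFA enrollment for all human users by creating an authentication policy with MFA_ENROLLMENT = REQUIRED and attaching it at the account or user level (ALTER ACCOUNT/USER ... SET AUTHENTICATION POLICY); note that MINS_TO_BYPASS_MFA only grants a temporary bypass and does not enable MFA. Configure SAML 2.0 federation with your identity provider using a SAML2 security integration (CREATE SECURITY INTEGRATION ... TYPE = SAML2) rather than the deprecated SAML_IDENTITY_PROVIDER account parameter. Enforce SSO as the primary authentication method and disable password-based authentication for users who authenticate via SSO (ALTER USER ... UNSET PASSWORD). Monitor LOGIN_HISTORY for non-SSO login attempts.",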
                "url": "https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-overview"
              }
            },
            {
              "id": "SFSEC05_BP02",
              "title": "Use key pair authentication or OAuth for service accounts and programmatic access",
              "helpfulResource": {
                "displayText": "For service accounts, ETL tools, and programmatic access, use key pair authentication (RSA 2048-bit minimum) or OAuth tokens instead of embedded passwords. This eliminates the risk of password exposure in code repositories or configuration files.",
                "url": "https://docs.snowflake.com/en/user-guide/key-pair-auth"
              },
              "improvementPlan": {
                "displayText": "Generate RSA key pairs for each service account and configure the public key in Snowflake using ALTER USER ... SET RSA_PUBLIC_KEY. Store private keys securely in AWS Secrets Manager. Implement key rotation on a defined schedule (e.g., every 90 days). For OAuth-based access, configure Snowflake as an OAuth resource server and use your identity provider to issue scoped tokens. Audit service account authentication methods and eliminate any remaining password-based connections.",
                "url": "https://docs.snowflake.com/en/user-guide/key-pair-auth"
              }
            },
            {
              "id": "SFSEC05_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSEC05_BP01 && SFSEC05_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFSEC05_BP01 || SFSEC05_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFSEC06",
          "title": "How do you prepare for and respond to security incidents affecting your Snowflake account?",
          "description": "A Snowflake-specific incident response plan ensures that credential compromise, unauthorized access, or data exfiltration events are contained quickly. Without pre-defined procedures, response time increases and blast radius expands.",
          "choices": [
            {
              "id": "SFSEC06_BP01",
              "title": "Maintain a Snowflake-specific IR runbook covering credential rotation, account suspension, and forensic queries",
              "helpfulResource": {
                "displayText": "Document step-by-step procedures for: (1) rotate compromised RSA keys (ALTER USER <user> UNSET RSA_PUBLIC_KEY; generate new keypair; ALTER USER <user> SET RSA_PUBLIC_KEY='<new_key>'; update Secrets Manager); (2) suspend affected user (ALTER USER <user> SET DISABLED=TRUE); (3) review ACCESS_HISTORY and LOGIN_HISTORY for scope of access; (4) notify Snowflake Support if Fail-Safe recovery is needed.",
                "url": "https://docs.snowflake.com/en/user-guide/admin-security"
              },
              "improvementPlan": {
                "displayText": "Create an IR runbook with these Snowflake-specific steps and store it in your incident management system (Confluence, PagerDuty runbook). Test the runbook quarterly with a tabletop exercise. Validate that your team can execute RSA key rotation end-to-end (AWS Secrets Manager → Snowflake → Lambda redeployment) in under 30 minutes.",
                "url": "https://docs.snowflake.com/en/user-guide/admin-security"
              }
            },
            {
              "id": "SFSEC06_BP02",
              "title": "Enable and monitor Snowflake Access History and Login History for anomaly detection",
              "helpfulResource": {
                "displayText": "ACCOUNT_USAGE.ACCESS_HISTORY (Enterprise Edition or higher) records the tables and columns read and written by each query, along with the query ID and user; join it to QUERY_HISTORY on QUERY_ID to obtain the role, warehouse, and bytes scanned. LOGIN_HISTORY captures all login attempts including failures, client type, and source IP. Regular review or automated alerting on these views provides early warning of unauthorized access.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/access_history"
              },
              "improvementPlan": {
                "displayText": "Create a Snowflake ALERT: IF(EXISTS(SELECT 1 FROM TABLE(INFORMATION_SCHEMA.LOGIN_HISTORY(TIME_RANGE_START => DATEADD('MINUTE',-15,CURRENT_TIMESTAMP()))) WHERE IS_SUCCESS='NO' HAVING COUNT(*)>5)) THEN notify. Use the INFORMATION_SCHEMA.LOGIN_HISTORY table function for this short window because the ACCOUNT_USAGE.LOGIN_HISTORY view can lag by up to two hours. Also alert on ACCOUNTADMIN role usage outside business hours via ACCESS_HISTORY joined with QUERY_HISTORY.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/login_history"
              }
            },
            {
              "id": "SFSEC06_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSEC06_BP01 && SFSEC06_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFSEC06_BP01 || SFSEC06_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        }
      ]
    },
    {
      "id": "reliability",
      "name": "Reliability",
      "questions": [
        {
          "id": "SFREL01",
          "title": "How do you configure Snowflake Time Travel and Fail-safe for business continuity?",
          "description": "Snowflake Time Travel enables querying and restoring historical data within a configurable retention window (up to 90 days on Enterprise edition), while Fail-safe provides an additional 7-day recovery window managed by Snowflake. Proper configuration of these features ensures that accidental data loss, corruption, or destructive operations can be recovered without external backup infrastructure, forming the foundation of Snowflake-native business continuity.",
          "choices": [
            {
              "id": "SFREL01_BP01",
              "title": "Configure Time Travel retention periods based on data criticality tiers",
              "helpfulResource": {
                "displayText": "Snowflake Time Travel allows you to query historical data, clone objects at a point in time, and restore dropped objects within the DATA_RETENTION_TIME_IN_DAYS window. Enterprise edition supports up to 90 days of retention. Configure retention tiers based on data criticality: 90 days for critical production tables (regulatory, financial, analytics), 14-30 days for standard operational tables, and 1 day for staging and transient workloads. This tiered approach balances recovery capability with storage cost, as Time Travel data consumes additional storage.",
                "url": "https://docs.snowflake.com/en/user-guide/data-time-travel"
              },
              "improvementPlan": {
                "displayText": "1. Classify all databases and schemas into criticality tiers: CRITICAL (production analytics, regulatory, financial), STANDARD (operational reporting, internal dashboards), and TRANSIENT (staging, ETL scratch, temporary). 2. Set DATA_RETENTION_TIME_IN_DAYS at the database or schema level using ALTER DATABASE/SCHEMA — 90 days for CRITICAL, 14-30 days for STANDARD, and 1 day for TRANSIENT. 3. Override at the table level where specific tables within a schema require different retention (e.g., a compliance audit table in an otherwise STANDARD schema). 4. Use TRANSIENT tables (CREATE TRANSIENT TABLE) for staging data that does not require Fail-safe protection, eliminating the 7-day Fail-safe storage overhead. 5. Use TEMPORARY tables for session-scoped data that does not need to persist beyond the session. 6. Create a retention policy document mapping each database/schema to its retention tier with business justification. 7. Implement a weekly automated audit using a Snowflake task that queries INFORMATION_SCHEMA.TABLES to identify tables with retention settings that deviate from their schema's policy and alerts on non-compliance. 8. Monitor Time Travel storage consumption via ACCOUNT_USAGE.TABLE_STORAGE_METRICS — track time_travel_bytes and failsafe_bytes to ensure storage costs remain within budget.",
                "url": "https://docs.snowflake.com/en/user-guide/data-time-travel"
              }
            },
            {
              "id": "SFREL01_BP02",
              "title": "Establish and test data recovery procedures using Time Travel and UNDROP",
              "helpfulResource": {
                "displayText": "Time Travel enables three critical recovery operations: querying historical data using AT/BEFORE clauses, cloning objects at a point in time using CREATE ... CLONE ... AT, and restoring dropped objects using UNDROP TABLE/SCHEMA/DATABASE. Establishing documented, tested recovery procedures ensures that when data loss or corruption occurs, the team can execute recovery rapidly and confidently without improvising under pressure.",
                "url": "https://docs.snowflake.com/en/user-guide/data-time-travel"
              },
              "improvementPlan": {
                "displayText": "1. Document recovery runbooks for the three primary recovery scenarios: (a) accidental row deletion or update — use SELECT ... AT/BEFORE to identify the pre-change state, then INSERT/MERGE to restore affected rows; (b) accidental table or schema drop — use UNDROP TABLE/SCHEMA/DATABASE within the retention window; (c) data corruption from a bad ETL job — use CREATE TABLE ... CLONE ... BEFORE to create a point-in-time copy, validate the clone, then swap with ALTER TABLE ... SWAP WITH. 2. Define Recovery Time Objectives (RTOs) for each data criticality tier: CRITICAL < 1 hour, STANDARD < 4 hours, TRANSIENT — best effort. 3. Conduct quarterly recovery drills in a non-production environment: simulate each scenario, measure actual recovery time against RTO, and document gaps. 4. Ensure that roles responsible for recovery operations have the necessary privileges (e.g., UNDROP requires ownership or ACCOUNTADMIN). 5. Create a recovery decision tree that guides on-call engineers through scenario identification and the appropriate recovery method. 6. Track recovery events in an incident log — record the scenario, recovery method used, time to recover, and lessons learned. 7. Review and update runbooks after each real recovery event or drill.",
                "url": "https://docs.snowflake.com/en/user-guide/data-time-travel"
              }
            },
            {
              "id": "SFREL01_BP03",
              "title": "Implement proactive safeguards to prevent accidental data loss",
              "helpfulResource": {
                "displayText": "While Time Travel and Fail-safe provide recovery capabilities, proactive safeguards reduce the likelihood of needing them. Snowflake provides object-level protections such as preventing accidental drops, and operational practices like pre-change snapshots and access controls on destructive operations significantly reduce the risk of data loss events.",
                "url": "https://docs.snowflake.com/en/user-guide/data-time-travel"
              },
              "improvementPlan": {
                "displayText": "1. Restrict DROP and TRUNCATE privileges on production tables to a dedicated data_admin role — do not grant these to general analyst or ETL roles. 2. Implement a pre-change snapshot practice: before any bulk UPDATE, DELETE, or TRUNCATE on a critical table, create a zero-copy clone (CREATE TABLE ... CLONE) as a rollback point. 3. Query the SNOWFLAKE.ACCOUNT_USAGE.OBJECT_DEPENDENCIES view to identify downstream dependencies before dropping or altering objects. 4. Create a change management process for DDL operations on production databases: require peer review and approval before executing DROP, TRUNCATE, or ALTER TABLE operations that modify retention settings. 5. Configure Snowflake alerts (CREATE ALERT) to notify the data engineering team when DROP TABLE or TRUNCATE TABLE statements are executed against production schemas. 6. Audit destructive operations weekly by querying QUERY_HISTORY for DROP, TRUNCATE, and DELETE statements against production databases. 7. Document the safeguard procedures and ensure all team members with production access are trained on them.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/query_history"
              }
            },
            {
              "id": "SFREL01_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFREL01_BP01 && SFREL01_BP02 && SFREL01_BP03",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFREL01_BP01 || SFREL01_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFREL02",
          "title": "How do you implement multi-region replication for critical Snowflake databases?",
          "description": "Multi-region database replication in Snowflake creates secondary read-only copies of primary databases in geographically separate regions, enabling disaster recovery, reduced read latency for distributed teams, and business continuity during regional outages. Snowflake supports replication across all AWS, Azure, and GCP regions, with failover groups providing coordinated failover of databases and account objects.",
          "choices": [
            {
              "id": "SFREL02_BP01",
              "title": "Identify critical databases and configure cross-region replication using failover groups",
              "helpfulResource": {
                "displayText": "Snowflake failover groups enable coordinated replication and failover of databases, account parameters, network policies, and other account objects across regions and cloud platforms. Identify databases that support critical business operations (production analytics, customer-facing applications, regulatory reporting) and configure them within failover groups that replicate to a designated secondary region. Business Critical edition or higher is required for failover/failback and Client Redirect capabilities.",
                "url": "https://docs.snowflake.com/en/user-guide/replication-failover"
              },
              "improvementPlan": {
                "displayText": "1. Classify all databases by business criticality and Recovery Point Objective (RPO): CRITICAL databases (RPO < 1 hour) that support production analytics, customer-facing applications, or regulatory reporting; IMPORTANT databases (RPO < 4 hours) that support internal operations; NON-CRITICAL databases that do not require replication. 2. Ensure your Snowflake account is Business Critical edition or higher, which is required for failover groups and Client Redirect. 3. Have a user with the ORGADMIN role enable replication for the source and target accounts using SELECT SYSTEM$GLOBAL_ACCOUNT_SET_PARAMETER('<org_name>.<account_name>', 'ENABLE_ACCOUNT_DATABASE_REPLICATION', 'true'). 4. Create a failover group in the primary account using CREATE FAILOVER GROUP that includes all CRITICAL databases, relevant account objects (network policies, warehouse configurations), and the target secondary account. 5. Create a secondary failover group in the target account using CREATE FAILOVER GROUP ... AS REPLICA OF to establish the replication relationship. 6. Configure replication refresh schedules using ALTER FAILOVER GROUP ... SET REPLICATION_SCHEDULE — the schedule applies to the whole group, so place CRITICAL and IMPORTANT databases in separate failover groups, refreshing CRITICAL groups every 10-30 minutes and IMPORTANT groups every 1-4 hours. 7. Document the replication topology including primary region, secondary region(s), failover group membership, and refresh schedules.",
                "url": "https://docs.snowflake.com/en/user-guide/account-replication-config"
              }
            },
            {
              "id": "SFREL02_BP02",
              "title": "Monitor replication health and lag to ensure RPO compliance",
              "helpfulResource": {
                "displayText": "Replication lag — the time difference between the primary and secondary database — directly determines your actual RPO. Monitoring replication health ensures that secondary databases remain synchronized within acceptable thresholds and that replication failures are detected and remediated before they impact disaster recovery readiness. Use REPLICATION_GROUP_REFRESH_HISTORY and REPLICATION_GROUP_USAGE_HISTORY views to track replication status, lag, and data transfer costs.",
                "url": "https://docs.snowflake.com/en/user-guide/account-replication-manage"
              },
              "improvementPlan": {
                "displayText": "1. Query ACCOUNT_USAGE.REPLICATION_GROUP_REFRESH_HISTORY regularly to monitor refresh status (SUCCESS, FAILED, CANCELED), refresh duration, and bytes transferred per refresh cycle. 2. Calculate replication lag by comparing the latest successful refresh timestamp against the current time — alert if lag exceeds the defined RPO threshold for each criticality tier. 3. Create Snowflake alerts (CREATE ALERT) that trigger when: (a) a replication refresh fails, (b) replication lag exceeds RPO threshold, or (c) no successful refresh has occurred within 2x the scheduled interval. 4. Monitor replication costs using REPLICATION_GROUP_USAGE_HISTORY — track data transfer bytes and associated credits to ensure replication costs remain within budget. 5. Build a replication health dashboard showing: current lag per failover group, refresh success/failure rates, data transfer volume trends, and RPO compliance percentage. 6. Investigate and resolve replication failures promptly — common causes include network issues, storage limits in the target account, and schema changes that conflict with replication. 7. Conduct monthly replication health reviews with stakeholders to confirm RPO targets remain appropriate and replication is performing within SLA.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/replication_group_refresh_history"
              }
            },
            {
              "id": "SFREL02_BP03",
              "title": "Establish and test failover/failback procedures with Client Redirect",
              "helpfulResource": {
                "displayText": "Snowflake Client Redirect provides a single connection URL that can be redirected to whichever account currently holds the primary connection (promoted with ALTER CONNECTION ... PRIMARY), eliminating the need for application-side connection string changes during a failover event. Establishing and regularly testing failover and failback procedures ensures that when a regional outage occurs, the team can execute a coordinated failover within the defined RTO and subsequently fail back to the primary region when it recovers.",
                "url": "https://docs.snowflake.com/en/user-guide/client-redirect"
              },
              "improvementPlan": {
                "displayText": "1. Configure Client Redirect by creating a connection object in the primary account (CREATE CONNECTION) and a replica of it in the secondary account (CREATE CONNECTION ... AS REPLICA OF), which yields a connection URL that can be redirected between primary and secondary accounts — this requires Business Critical edition. 2. Update all application connection strings, ETL tools, and BI platforms to use the Client Redirect URL instead of direct account URLs. 3. Document the failover procedure: (a) confirm the outage is regional and not transient, (b) verify secondary database freshness by checking the last successful replication refresh, (c) execute ALTER FAILOVER GROUP ... PRIMARY in the secondary account to promote it, (d) execute ALTER CONNECTION ... PRIMARY in the same account and validate that Client Redirect is routing connections to the new primary, (e) confirm application connectivity and data accessibility. 4. Document the failback procedure: (a) once the original primary region recovers, reverse replication to synchronize changes made during the failover period, (b) execute ALTER FAILOVER GROUP ... PRIMARY and ALTER CONNECTION ... PRIMARY in the original primary account to restore it, (c) validate Client Redirect routing and application connectivity. 5. Define RTO targets: CRITICAL workloads < 1 hour, IMPORTANT workloads < 4 hours. 6. Conduct semi-annual failover drills in a non-production environment — execute the full failover and failback procedure, measure actual RTO, and document gaps. 7. Maintain a failover decision matrix that defines who can authorize a failover, under what conditions, and the communication plan for stakeholders during an event.",
                "url": "https://docs.snowflake.com/en/user-guide/replication-failover"
              }
            },
            {
              "id": "SFREL02_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFREL02_BP01 && SFREL02_BP02 && SFREL02_BP03",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFREL02_BP01 || SFREL02_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFREL03",
          "title": "How do you implement virtual warehouse failover and query retry patterns?",
          "description": "Virtual warehouse availability directly impacts query execution reliability. Snowflake's architecture provides built-in fault tolerance through automated query retries on transient failures, but organizations must also design for warehouse-level resilience through multi-cluster configurations, workload isolation, and application-level retry logic to ensure that critical workloads continue executing even during infrastructure disruptions or warehouse resource contention.",
          "choices": [
            {
              "id": "SFREL03_BP01",
              "title": "Configure multi-cluster warehouses and workload isolation for warehouse-level resilience",
              "helpfulResource": {
                "displayText": "Multi-cluster warehouses automatically scale out additional compute clusters when query concurrency exceeds the capacity of a single cluster, preventing query queuing and timeouts during demand spikes. Workload isolation — dedicating separate warehouses to different workload classes (ETL, BI, ad-hoc) — ensures that a resource-intensive workload in one class cannot starve or disrupt another. Together, these patterns provide warehouse-level resilience against both demand spikes and workload interference.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-multicluster"
              },
              "improvementPlan": {
                "displayText": "1. Identify workload classes and assign each to a dedicated warehouse: ETL/batch processing, interactive BI/dashboards, ad-hoc analyst queries, and application/API queries. 2. Configure multi-cluster scaling for workloads with variable concurrency: set MIN_CLUSTER_COUNT = 1 and MAX_CLUSTER_COUNT based on peak concurrency analysis from QUERY_HISTORY. 3. Set the scaling policy to STANDARD for latency-sensitive workloads (BI, application queries) to scale out immediately when queries queue, and ECONOMY for cost-sensitive workloads (batch ETL) that can tolerate brief queuing. 4. Configure AUTO_SUSPEND aggressively (60 seconds for most workloads) to release idle clusters and reduce costs. 5. Ensure AUTO_RESUME = TRUE on all warehouses so queries are never blocked by a suspended warehouse. 6. Monitor QUEUED_OVERLOAD_TIME and QUEUED_PROVISIONING_TIME in QUERY_HISTORY to detect when warehouses are under-provisioned or scaling too slowly. 7. Review warehouse isolation boundaries quarterly — as workload patterns evolve, some warehouses may need to be split further or consolidated.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-multicluster"
              }
            },
            {
              "id": "SFREL03_BP02",
              "title": "Implement application-level query retry logic with exponential backoff",
              "helpfulResource": {
                "displayText": "Snowflake automatically retries queries that fail due to transient internal errors, but application-level retry logic is essential for handling connection timeouts, warehouse suspension/resume delays, and transient network issues between the client and Snowflake. Implementing retry logic with exponential backoff and jitter in your application or ETL framework ensures that transient failures are recovered automatically without manual intervention or data pipeline disruptions.",
                "url": "https://docs.snowflake.com/en/developer-guide/snowflake-scripting/exceptions"
              },
              "improvementPlan": {
                "displayText": "1. Implement retry logic in all application and ETL code that connects to Snowflake — use exponential backoff starting at 1 second with a maximum of 5 retries and a cap of 60 seconds between retries. 2. Add jitter (randomized delay) to retry intervals to prevent thundering herd effects when multiple clients retry simultaneously after a transient outage. 3. Classify errors as retryable vs. non-retryable: retryable errors include connection timeouts, HTTP 503 (service unavailable), warehouse resume delays, and Snowflake internal errors; non-retryable errors include syntax errors, permission denied, and resource limit exceeded. 4. For Snowflake connectors (Python, JDBC, ODBC, Spark), configure the built-in timeout and retry parameters, for example login_timeout, network_timeout, and socket_timeout in the Python connector, or loginTimeout, networkTimeout, and maxHttpRetries in the JDBC driver. 5. Implement idempotent query patterns for write operations — use MERGE instead of INSERT to ensure that retried write queries do not create duplicate data. 6. Log all retry events with the error code, retry attempt number, and final outcome for operational visibility. 7. Monitor retry rates as a leading indicator of infrastructure health — a spike in retries may indicate an emerging issue before it causes visible failures. 8. Set alerting thresholds on retry rates: alert if retry rate exceeds 5% of total queries in any 15-minute window.",
                "url": "https://docs.snowflake.com/en/developer-guide/snowflake-scripting/exceptions"
              }
            },
            {
              "id": "SFREL03_BP03",
              "title": "Establish warehouse health monitoring and automated incident response",
              "helpfulResource": {
                "displayText": "Proactive warehouse health monitoring detects performance degradation, resource contention, and availability issues before they impact end users. Combining Snowflake's ACCOUNT_USAGE views with automated alerting and incident response procedures ensures that warehouse-level issues are identified and resolved rapidly, maintaining query execution reliability across all workloads.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/warehouse_metering_history"
              },
              "improvementPlan": {
                "displayText": "1. Create a warehouse health monitoring dashboard that tracks: query success/failure rates per warehouse, average and P95 query execution times, queue depth (QUEUED_OVERLOAD_TIME > 0), warehouse utilization (active time vs. idle time), and multi-cluster scaling events. 2. Configure Snowflake alerts for critical warehouse health conditions: (a) query failure rate exceeds 5% in a 15-minute window, (b) P95 query execution time exceeds 2x the trailing 7-day average, (c) warehouse queue depth exceeds 10 concurrent queries for more than 5 minutes, (d) warehouse fails to resume within 30 seconds. 3. Route alerts to your incident management platform (PagerDuty, SNS, Slack) with appropriate severity levels. 4. Create incident response runbooks for common warehouse issues: warehouse stuck in provisioning state (resolution: suspend and resume, or create a replacement warehouse), sustained query queuing (resolution: scale up warehouse size or increase MAX_CLUSTER_COUNT), query failures due to resource exhaustion (resolution: identify runaway queries and terminate them using SYSTEM$CANCEL_QUERY for a single query or SYSTEM$CANCEL_ALL_QUERIES for an entire session). 5. Implement a circuit breaker pattern for application queries: if a warehouse is consistently failing or timing out, redirect queries to a backup warehouse or return a graceful degradation response. 6. Track warehouse availability as an SLI (Service Level Indicator): percentage of time each warehouse is available and responsive, with a target SLO of 99.9%. 7. Review warehouse health metrics in weekly operational reviews and adjust configurations based on trends.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/query_history"
              }
            },
            {
              "id": "SFREL03_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFREL03_BP01 && SFREL03_BP02 && SFREL03_BP03",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFREL03_BP01 || SFREL03_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFREL04",
          "title": "How do you leverage Snowflake Fail-Safe for disaster recovery beyond Time Travel?",
          "description": "Fail-Safe provides an additional 7-day recovery window after Time Travel expires, managed by Snowflake Support. It is the last line of defense against data loss due to accidental or malicious deletion. Understanding its scope and limitations is essential for DR planning.",
          "choices": [
            {
              "id": "SFREL04_BP01",
              "title": "Document Fail-Safe coverage and incorporate it into your DR runbook",
              "helpfulResource": {
                "displayText": "Fail-Safe automatically protects permanent tables for 7 days after Time Travel expires — no configuration required. However, it is only accessible via Snowflake Support (not SQL). Transient and temporary tables have no Fail-Safe. Document which tables have Fail-Safe and set your RTO/RPO expectations accordingly.",
                "url": "https://docs.snowflake.com/en/user-guide/data-failsafe"
              },
              "improvementPlan": {
                "displayText": "Audit table types: SELECT table_name, table_type, retention_time, failsafe_period_in_days FROM INFORMATION_SCHEMA.TABLES. Ensure critical tables are PERMANENT (not TRANSIENT). Add a DR runbook entry: 'For data loss beyond Time Travel window, contact Snowflake Support with account ID and affected table name within 7 days.'",
                "url": "https://docs.snowflake.com/en/user-guide/data-failsafe"
              }
            },
            {
              "id": "SFREL04_BP02",
              "title": "Use zero-copy clones as point-in-time snapshots for additional recovery flexibility",
              "helpfulResource": {
                "displayText": "Zero-copy clones create an instant, storage-efficient snapshot of a table or database at a specific point in time. Unlike Time Travel, clones persist indefinitely and can be queried or restored at any time. Use clones before major schema changes, bulk loads, or destructive operations.",
                "url": "https://docs.snowflake.com/en/user-guide/object-clone"
              },
              "improvementPlan": {
                "displayText": "Before any high-risk operation: CREATE TABLE orders_backup_20240115 CLONE orders; or CREATE DATABASE prod_backup_20240115 CLONE prod_db; Schedule weekly clones of critical tables as a cheap insurance policy. Clones add storage cost only for rows that diverge after cloning.",
                "url": "https://docs.snowflake.com/en/user-guide/object-clone"
              }
            },
            {
              "id": "SFREL04_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFREL04_BP01 && SFREL04_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFREL04_BP01 || SFREL04_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFREL05",
          "title": "How do you ensure reliability and observability for Snowflake Tasks and Streams?",
          "description": "Tasks and Streams are foundational for CDC and incremental processing pipelines. Without monitoring, silent failures leave Streams unconsumed (staleness risk) and Tasks stuck in error states, causing data freshness SLA breaches.",
          "choices": [
            {
              "id": "SFREL05_BP01",
              "title": "Monitor Task execution health via TASK_HISTORY and configure error alerting",
              "helpfulResource": {
                "displayText": "Query ACCOUNT_USAGE.TASK_HISTORY for failed or long-running tasks. Create Snowflake ALERT objects that fire when a task transitions to FAILED state. For task graphs, ensure the root task has an error integration configured so failures propagate to your incident management system.",
                "url": "https://docs.snowflake.com/en/user-guide/tasks-intro"
              },
              "improvementPlan": {
                "displayText": "Set up a monitoring alert: CREATE ALERT task_failure_alert WAREHOUSE=monitor_wh SCHEDULE='5 MINUTE' IF(EXISTS(SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.TASK_HISTORY WHERE STATE='FAILED' AND SCHEDULED_TIME > DATEADD('MINUTE',-5,CURRENT_TIMESTAMP()))) THEN CALL SYSTEM$SEND_EMAIL(...); Also track SYSTEM$GET_TASK_GRAPH_RUN_STATUS() for complex graphs.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/task_history"
              }
            },
            {
              "id": "SFREL05_BP02",
              "title": "Monitor Stream staleness and define a maximum acceptable Stream offset lag",
              "helpfulResource": {
                "displayText": "Streams become stale if not consumed before their staleness period expires (based on Time Travel retention). A stale Stream loses its offset and cannot be queried until reset. Monitor STALE_AFTER in SHOW STREAMS and alert when a Stream's offset is not advanced within SLA.",
                "url": "https://docs.snowflake.com/en/user-guide/streams-intro"
              },
              "improvementPlan": {
                "displayText": "Query: SHOW STREAMS; filter on STALE=TRUE or STALE_AFTER < DATEADD('hour',4,CURRENT_TIMESTAMP()). Create a Snowflake ALERT that fires when any production Stream is within 4 hours of staling. Establish a consumer task or procedure that advances the Stream offset on schedule.",
                "url": "https://docs.snowflake.com/en/user-guide/streams-intro"
              }
            },
            {
              "id": "SFREL05_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFREL05_BP01 && SFREL05_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFREL05_BP01 || SFREL05_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFREL06",
          "title": "How do you manage schema changes in Snowflake without disrupting downstream pipelines?",
          "description": "Schema changes (column additions, type changes, renames) can silently break downstream consumers  -  dashboards, ML models, and pipelines that depend on specific column names or types. Schema governance prevents uncoordinated changes.",
          "choices": [
            {
              "id": "SFREL06_BP01",
              "title": "Enable schema evolution on Snowpipe Streaming tables and use MATCH_BY_COLUMN_NAME for safe ingestion",
              "helpfulResource": {
                "displayText": "Enable ENABLE_SCHEMA_EVOLUTION=TRUE on tables receiving Snowpipe Streaming or COPY INTO loads. Combined with MATCH_BY_COLUMN_NAME=CASE_INSENSITIVE, new source fields auto-add columns rather than failing. This allows producers to add fields without coordinating schema changes with consumers.",
                "url": "https://docs.snowflake.com/en/user-guide/data-load-schema-evolution"
              },
              "improvementPlan": {
                "displayText": "ALTER TABLE <table> SET ENABLE_SCHEMA_EVOLUTION=TRUE; GRANT EVOLVE SCHEMA ON TABLE <table> TO ROLE <ingest_role>; Verify in ACCOUNT_USAGE.QUERY_HISTORY that schema evolution events appear as ALTER TABLE statements. Test by sending a record with a new field and confirming DESCRIBE TABLE shows the new column.",
                "url": "https://docs.snowflake.com/en/user-guide/data-load-schema-evolution"
              }
            },
            {
              "id": "SFREL06_BP02",
              "title": "Use DCM or dbt for version-controlled, reviewed schema migrations on managed tables",
              "helpfulResource": {
                "displayText": "For tables with established consumers, use Infrastructure-as-Code (DCM, dbt, Terraform Snowflake provider) to manage schema changes through a PR review process. This ensures all stakeholders review column additions, renames, or type changes before they reach production.",
                "url": "https://docs.snowflake.com/en/developer-guide/native-apps/dcm/about-dcm"
              },
              "improvementPlan": {
                "displayText": "Store table DDL in Git under dcm/databases/<db>/schemas/<schema>/tables/. Submit changes via PR. Use `dcm diff` to preview impact before `dcm deploy`. Tag schema change PRs with affected downstream systems (dashboards, ML models, APIs) so owners can review and test before merge.",
                "url": "https://docs.snowflake.com/en/developer-guide/native-apps/dcm/about-dcm"
              }
            },
            {
              "id": "SFREL06_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFREL06_BP01 && SFREL06_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFREL06_BP01 || SFREL06_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        }
      ]
    },
    {
      "id": "performanceEfficiency",
      "name": "Performance Efficiency",
      "questions": [
        {
          "id": "SFPERF01",
          "title": "How do you optimize virtual warehouse sizing, auto-suspend, and auto-resume for performance efficiency?",
          "description": "FR-18: Virtual warehouse sizing directly impacts query performance, concurrency, and cost. Snowflake's elastic compute model allows warehouses to be sized from X-Small to 6X-Large, with auto-suspend and auto-resume features that balance availability with cost efficiency. Proper warehouse sizing ensures queries execute within acceptable latency while avoiding over-provisioning that wastes credits.",
          "choices": [
            {
              "id": "SFPERF01_BP01",
              "title": "Right-size warehouses based on workload profiling and iterative benchmarking",
              "helpfulResource": {
                "displayText": "Snowflake warehouses double in compute resources with each size increment (Small = 2 credits/hour, Medium = 4, Large = 8, etc.). Right-sizing requires profiling each workload's query complexity, data volume, and concurrency patterns using QUERY_HISTORY, then iteratively testing warehouse sizes to find the optimal balance between execution time and credit consumption. Snowflake's per-second billing model means larger warehouses that complete queries faster can sometimes cost the same as smaller warehouses that run longer.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-overview"
              },
              "improvementPlan": {
                "displayText": "1. Profile each warehouse's workload by querying ACCOUNT_USAGE.QUERY_HISTORY: calculate average and P95 execution times, bytes scanned, rows produced, and QUEUED_OVERLOAD_TIME per warehouse over the past 30 days. 2. Categorize workloads by type: interactive BI (latency-sensitive, target < 10 seconds), batch ETL (throughput-sensitive, target completion within SLA window), ad-hoc analytics (moderate latency tolerance), and application/API queries (strict latency SLAs). 3. For each warehouse, run a sizing benchmark: execute representative queries at the current size and one size up and one size down, comparing execution time and total credits consumed. 4. Apply the cost-efficiency rule: if doubling warehouse size cuts execution time by more than half, the larger size is more cost-efficient due to per-second billing. 5. For warehouses with variable workload complexity, consider using multi-cluster scaling (horizontal) rather than upsizing (vertical) to handle concurrency spikes without over-provisioning for simple queries. 6. Document the sizing rationale for each warehouse including workload profile, benchmark results, and chosen size. 7. Conduct quarterly right-sizing reviews by re-running the profiling analysis to detect workload drift.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-overview"
              }
            },
            {
              "id": "SFPERF01_BP02",
              "title": "Configure auto-suspend and auto-resume to balance availability with cost efficiency",
              "helpfulResource": {
                "displayText": "Auto-suspend stops a warehouse after a configurable period of inactivity, eliminating idle credit consumption. Auto-resume automatically restarts the warehouse when a new query arrives, typically within 1-2 seconds. The optimal auto-suspend timeout depends on the workload pattern: aggressive timeouts (60 seconds) maximize cost savings for sporadic workloads, while longer timeouts (300 seconds) preserve the warehouse's local SSD cache for interactive workloads that benefit from warm cache performance.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-considerations"
              },
              "improvementPlan": {
                "displayText": "1. Audit all warehouses for current AUTO_SUSPEND and AUTO_RESUME settings using SHOW WAREHOUSES — identify warehouses with AUTO_SUSPEND disabled or set to excessively long timeouts (> 600 seconds). 2. Set AUTO_SUSPEND to 60 seconds for batch ETL warehouses, ad-hoc query warehouses, and any warehouse with sporadic usage patterns. 3. Set AUTO_SUSPEND to 300 seconds for interactive BI warehouses where users issue queries in rapid succession and benefit from warm SSD cache. 4. For scheduled batch workloads managed by orchestration tools (Airflow, dbt, Snowflake Tasks), consider setting AUTO_SUSPEND to 0 (immediate) since the orchestrator controls warehouse lifecycle. 5. Ensure AUTO_RESUME = TRUE on all warehouses — a suspended warehouse with AUTO_RESUME disabled will block queries until manually resumed. 6. Monitor the impact of auto-suspend settings by comparing WAREHOUSE_METERING_HISTORY credit consumption before and after changes. 7. Establish a governance policy: all new warehouses must be created with AUTO_SUSPEND <= 120 seconds and AUTO_RESUME = TRUE, with exceptions requiring documented justification.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-considerations"
              }
            },
            {
              "id": "SFPERF01_BP03",
              "title": "Implement workload isolation and multi-cluster scaling for concurrency management",
              "helpfulResource": {
                "displayText": "Workload isolation assigns different workload classes to dedicated warehouses, preventing resource contention between ETL, BI, and ad-hoc workloads. Multi-cluster warehouses automatically scale out additional compute clusters when query concurrency exceeds single-cluster capacity, preventing query queuing. Together, these patterns ensure that each workload class receives predictable performance regardless of activity in other workload classes.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-multicluster"
              },
              "improvementPlan": {
                "displayText": "1. Identify distinct workload classes and create dedicated warehouses: ETL/batch processing, interactive BI/dashboards, ad-hoc analyst queries, application/API queries, and data science/ML workloads. 2. Size each warehouse independently based on its workload profile — ETL warehouses may need larger sizes for complex transformations, while BI warehouses may need smaller sizes with multi-cluster scaling for high concurrency. 3. Configure multi-cluster scaling for concurrency-sensitive workloads: set MIN_CLUSTER_COUNT = 1 and MAX_CLUSTER_COUNT based on peak concurrency analysis (query QUERY_HISTORY for maximum concurrent queries per 5-minute window). 4. Set scaling policy to STANDARD for latency-sensitive workloads (BI, application) to scale out immediately when queries queue, and ECONOMY for cost-sensitive workloads (batch ETL) that can tolerate brief queuing. 5. Monitor QUEUED_OVERLOAD_TIME and QUEUED_PROVISIONING_TIME in QUERY_HISTORY to detect when warehouses are under-provisioned or scaling too slowly. 6. Implement warehouse assignment enforcement using Snowflake roles — grant USAGE on each warehouse only to the roles associated with its workload class. 7. Review workload isolation boundaries quarterly as workload patterns evolve.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-multicluster"
              }
            },
            {
              "id": "SFPERF01_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFPERF01_BP01 && SFPERF01_BP02 && SFPERF01_BP03",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFPERF01_BP01 || SFPERF01_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFPERF02",
          "title": "How do you optimize clustering keys and micro-partition pruning for query performance?",
          "description": "FR-19: Snowflake stores data in compressed, columnar micro-partitions (50-500MB uncompressed each) and maintains metadata about min/max values, distinct counts, and null counts for each column in every micro-partition. Query performance depends heavily on partition pruning  -  Snowflake's ability to skip micro-partitions that do not contain relevant data based on query predicates. Clustering keys allow you to control how data is organized across micro-partitions, enabling more effective pruning for frequently filtered columns.",
          "choices": [
            {
              "id": "SFPERF02_BP01",
              "title": "Identify clustering key candidates using query pattern analysis and clustering depth metrics",
              "helpfulResource": {
                "displayText": "Clustering keys should be chosen based on the columns most frequently used in WHERE clauses, JOIN conditions, and ORDER BY clauses of your most critical and frequent queries. Snowflake's SYSTEM$CLUSTERING_INFORMATION function provides clustering depth and overlap metrics that quantify how well a table's data is organized for a given set of columns. Tables with high clustering depth (> 4) on frequently filtered columns are strong candidates for explicit clustering keys.",
                "url": "https://docs.snowflake.com/en/user-guide/tables-clustering-keys"
              },
              "improvementPlan": {
                "displayText": "1. Identify the top 20 most frequently executed and most resource-intensive queries per table by analyzing QUERY_HISTORY — focus on queries with high BYTES_SCANNED relative to ROWS_PRODUCED (indicating poor pruning). 2. Extract the most common filter columns from these queries' WHERE clauses and JOIN conditions. 3. For each candidate table, run SYSTEM$CLUSTERING_INFORMATION(table_name, '(column1, column2)') to assess current clustering quality — a high average_depth (> 4) and high average_overlap indicate poor clustering on those columns. 4. Select clustering keys with 2-4 columns maximum, ordered from lowest cardinality to highest (e.g., date column first, then region, then customer_id). 5. Avoid clustering on high-cardinality columns alone (e.g., UUID primary keys) as they provide minimal pruning benefit. 6. For tables with multiple distinct query patterns, evaluate whether a single clustering key serves the majority of queries or whether table restructuring (e.g., materialized views) is needed. 7. Document the clustering key selection rationale including the query patterns analyzed, clustering metrics before/after, and expected pruning improvement.",
                "url": "https://docs.snowflake.com/en/user-guide/tables-clustering-keys"
              }
            },
            {
              "id": "SFPERF02_BP02",
              "title": "Implement and monitor Automatic Clustering for large, frequently queried tables",
              "helpfulResource": {
                "displayText": "Snowflake's Automatic Clustering service continuously reorganizes micro-partitions in the background to maintain optimal clustering as new data is loaded. Once a clustering key is defined using ALTER TABLE ... CLUSTER BY, Automatic Clustering activates and maintains the clustering over time. Monitoring clustering health ensures that the service is keeping pace with data ingestion and that clustering costs remain proportional to the performance benefit.",
                "url": "https://docs.snowflake.com/en/user-guide/tables-auto-reclustering"
              },
              "improvementPlan": {
                "displayText": "1. Apply clustering keys only to tables that meet all three criteria: (a) table size exceeds 1 TB or contains more than 1 billion rows, (b) queries frequently filter on specific columns, and (c) SYSTEM$CLUSTERING_INFORMATION shows poor clustering depth on those columns. 2. Define the clustering key using ALTER TABLE ... CLUSTER BY (column1, column2) — Automatic Clustering will begin reorganizing micro-partitions in the background. 3. Monitor clustering progress using SYSTEM$CLUSTERING_INFORMATION — track average_depth and average_overlap over time to confirm improvement. 4. Monitor Automatic Clustering costs using ACCOUNT_USAGE.AUTOMATIC_CLUSTERING_HISTORY — track credits consumed per table per day. 5. Evaluate the cost-benefit ratio: compare the credits spent on Automatic Clustering against the credits saved from reduced query execution time (fewer bytes scanned, faster completion). 6. For tables with very high data ingestion rates, monitor whether Automatic Clustering can keep pace — if clustering depth degrades during peak ingestion, consider adjusting ingestion patterns or pre-sorting data before loading. 7. Review clustering key effectiveness quarterly by re-analyzing query patterns — if query patterns shift, the clustering key may need to be updated.",
                "url": "https://docs.snowflake.com/en/user-guide/tables-auto-reclustering"
              }
            },
            {
              "id": "SFPERF02_BP03",
              "title": "Optimize micro-partition pruning through query design and data modeling best practices",
              "helpfulResource": {
                "displayText": "Even with optimal clustering keys, query design and data modeling choices significantly impact partition pruning effectiveness. Writing pruning-friendly queries, using appropriate data types, and structuring tables to align with access patterns ensures that Snowflake's query optimizer can eliminate the maximum number of micro-partitions before scanning data.",
                "url": "https://docs.snowflake.com/en/user-guide/tables-clustering-micropartitions"
              },
              "improvementPlan": {
                "displayText": "1. Write pruning-friendly queries: use explicit range predicates on clustering key columns (e.g., WHERE date BETWEEN '2025-01-01' AND '2025-03-31') rather than functions that prevent pruning (e.g., WHERE YEAR(date) = 2025). 2. Use the QUERY_PROFILE to verify partition pruning effectiveness — check the 'Partitions scanned' vs. 'Partitions total' ratio for TableScan operators; a high ratio indicates poor pruning. 3. Ensure date/timestamp columns used in filters are stored as native DATE or TIMESTAMP types, not VARCHAR — Snowflake can only prune on native types. 4. For tables with both date-based and category-based access patterns, consider creating materialized views with different clustering optimized for each pattern. 5. Use search optimization service (ALTER TABLE ... ADD SEARCH OPTIMIZATION) for tables with equality predicates on high-cardinality columns (e.g., WHERE user_id = 'abc123') where clustering is not effective. 6. Monitor pruning efficiency across your top queries monthly using QUERY_PROFILE data — set a target of > 80% partition pruning for critical queries. 7. Educate analysts and data engineers on pruning-friendly query patterns through documentation and code review guidelines.",
                "url": "https://docs.snowflake.com/en/user-guide/tables-clustering-micropartitions"
              }
            },
            {
              "id": "SFPERF02_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFPERF02_BP01 && SFPERF02_BP02 && SFPERF02_BP03",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFPERF02_BP01 || SFPERF02_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFPERF03",
          "title": "How do you use query performance diagnostics and QUERY_HISTORY for continuous optimization?",
          "description": "FR-20: Snowflake's ACCOUNT_USAGE.QUERY_HISTORY view and QUERY_PROFILE provide detailed execution metrics for every query  -  execution time, bytes scanned, rows produced, compilation time, queuing time, and operator-level statistics. Systematic use of these diagnostics enables continuous identification and resolution of performance bottlenecks, query regressions, and optimization opportunities across your Snowflake workloads.",
          "choices": [
            {
              "id": "SFPERF03_BP01",
              "title": "Establish systematic query performance monitoring using QUERY_HISTORY views",
              "helpfulResource": {
                "displayText": "ACCOUNT_USAGE.QUERY_HISTORY retains 365 days of query execution data including execution time, bytes scanned, rows produced, compilation time, queuing time, warehouse name, and error codes. Building systematic monitoring on this data enables proactive identification of slow queries, query regressions, and workload patterns that impact performance. Key metrics to track include P50/P95/P99 execution times, bytes scanned per query, and the ratio of queuing time to execution time.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/query_history"
              },
              "improvementPlan": {
                "displayText": "1. Create a scheduled Snowflake task that runs daily to compute key performance metrics from QUERY_HISTORY: P50, P95, and P99 execution times by warehouse and query type, average bytes scanned per query, total queuing time, and query failure rates. 2. Build a query performance dashboard that visualizes: daily execution time trends (P50/P95/P99), top 20 slowest queries by average execution time, top 20 most expensive queries by bytes scanned, queries with execution time regression (> 2x increase vs. trailing 7-day average), and warehouse queuing time trends. 3. Create Snowflake alerts for critical performance conditions: (a) P95 execution time exceeds 2x the trailing 7-day average for any warehouse, (b) any single query runs longer than a defined threshold (e.g., 30 minutes), (c) query failure rate exceeds 5% in any 15-minute window. 4. Implement a weekly query performance review process: review the top 10 slowest and most expensive queries, identify optimization opportunities, and assign remediation actions. 5. Track query performance SLIs: percentage of queries completing within SLA (e.g., < 10 seconds for BI, < 5 minutes for ETL), and set improvement targets. 6. Archive QUERY_HISTORY data beyond 365 days to S3 for long-term trend analysis.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/query_history"
              }
            },
            {
              "id": "SFPERF03_BP02",
              "title": "Use QUERY_PROFILE for deep-dive diagnostics on slow and expensive queries",
              "helpfulResource": {
                "displayText": "The QUERY_PROFILE provides operator-level execution statistics for individual queries — showing the execution plan as a DAG of operators (TableScan, Filter, Join, Aggregate, Sort) with metrics for each: rows processed, bytes scanned, partition pruning ratios, spilling to local/remote storage, and time spent. This granular visibility enables precise identification of performance bottlenecks such as full table scans, inefficient joins, excessive spilling, and poor partition pruning.",
                "url": "https://docs.snowflake.com/en/user-guide/ui-query-profile"
              },
              "improvementPlan": {
                "displayText": "1. For every query identified as slow or expensive in the weekly performance review, analyze its QUERY_PROFILE to identify the bottleneck operator. 2. Check partition pruning: if 'Partitions scanned' is close to 'Partitions total' on TableScan operators, the query is not pruning effectively — investigate clustering keys or query predicate design. 3. Check for spilling: if any operator shows 'Bytes spilled to local storage' or 'Bytes spilled to remote storage', the warehouse is undersized for that query — consider upsizing the warehouse or optimizing the query to reduce intermediate result sizes. 4. Check join efficiency: if a JoinFilter or Join operator processes significantly more rows than expected, investigate join key cardinality, data skew, and whether a broadcast join is being used inappropriately for large tables. 5. Check for excessive sorting: if Sort operators consume a large percentage of execution time, evaluate whether the ORDER BY is necessary or whether pre-sorted data (via clustering) could eliminate the sort. 6. Create a query optimization playbook documenting common QUERY_PROFILE patterns and their resolutions. 7. Train data engineers and analysts on reading QUERY_PROFILE output and applying optimization techniques.",
                "url": "https://docs.snowflake.com/en/user-guide/ui-query-profile"
              }
            },
            {
              "id": "SFPERF03_BP03",
              "title": "Implement query regression detection and continuous performance optimization processes",
              "helpfulResource": {
                "displayText": "Query regressions — queries that suddenly take longer to execute due to data growth, schema changes, or Snowflake service updates — can silently degrade user experience and increase costs. Implementing automated regression detection using QUERY_HISTORY data ensures that performance degradations are caught early and remediated before they impact SLAs.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/query_history"
              },
              "improvementPlan": {
                "displayText": "1. Implement automated query regression detection: create a Snowflake task that compares each query's average execution time over the past 24 hours against its trailing 7-day average — flag queries with > 2x increase as regressions. 2. Use QUERY_TAG or QUERY_HASH to group related queries and track their performance over time, even when literal values change. 3. Create a regression alert that notifies the data engineering team when a critical query (tagged as SLA-bound) regresses beyond its defined threshold. 4. Establish a query optimization backlog: maintain a prioritized list of queries to optimize based on business impact (SLA criticality), resource consumption (bytes scanned, credits consumed), and frequency of execution. 5. Implement a query change management process: when ETL logic or data models change, benchmark query performance before and after the change to catch regressions at deployment time. 6. Track optimization outcomes: for each query optimized, record the before/after execution time, bytes scanned, and credits consumed to quantify the improvement. 7. Conduct monthly performance optimization retrospectives to review optimization outcomes, update the playbook, and identify systemic patterns that could be addressed through infrastructure changes (clustering, materialized views, warehouse sizing).",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/query_history"
              }
            },
            {
              "id": "SFPERF03_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFPERF03_BP01 && SFPERF03_BP02 && SFPERF03_BP03",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFPERF03_BP01 || SFPERF03_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        }
      ]
    },
    {
      "id": "costOptimization",
      "name": "Cost Optimization",
      "questions": [
        {
          "id": "SFCOST01",
          "title": "How do you set credit quotas and enforce spend limits on Snowflake warehouses?",
          "description": "Resource monitors allow you to define credit quotas at the account or warehouse level and take automated actions (notify, suspend) when thresholds are crossed. Without them, runaway queries or misconfigured warehouses can exhaust credits silently.",
          "choices": [
            {
              "id": "SFCOST01_BP01",
              "title": "Create warehouse-level resource monitors with multi-tier thresholds",
              "helpfulResource": {
                "displayText": "Assign a RESOURCE MONITOR to every production warehouse. Configure thresholds at 75%, 90%, and 100% of monthly quota. Use NOTIFY action at lower thresholds and SUSPEND or SUSPEND_IMMEDIATE at 100% for non-critical warehouses to prevent overspend.",
                "url": "https://docs.snowflake.com/en/user-guide/resource-monitors"
              },
              "improvementPlan": {
                "displayText": "Run: CREATE RESOURCE MONITOR <name> WITH CREDIT_QUOTA=<n> TRIGGERS ON 75 PERCENT DO NOTIFY ON 90 PERCENT DO NOTIFY ON 100 PERCENT DO SUSPEND; ALTER WAREHOUSE <wh> SET RESOURCE_MONITOR=<name>. Review and adjust quotas monthly using ACCOUNT_USAGE.WAREHOUSE_METERING_HISTORY.",
                "url": "https://docs.snowflake.com/en/user-guide/resource-monitors"
              }
            },
            {
              "id": "SFCOST01_BP02",
              "title": "Configure an account-level resource monitor as a catch-all safety net",
              "helpfulResource": {
                "displayText": "An account-level monitor caps total credit consumption across all user-managed warehouses in the account. Set it 10–20% above expected monthly spend as a hard ceiling. This catches usage from ad-hoc warehouses that may not have individual monitors. Resource monitors do not track serverless features such as Snowpipe, serverless Tasks, or Cortex; use Snowflake Budgets to monitor those.",
                "url": "https://docs.snowflake.com/en/user-guide/resource-monitors"
              },
              "improvementPlan": {
                "displayText": "Run: CREATE RESOURCE MONITOR account_guard WITH CREDIT_QUOTA=<n> FREQUENCY=MONTHLY START_TIMESTAMP=IMMEDIATELY TRIGGERS ON 90 PERCENT DO NOTIFY ON 100 PERCENT DO SUSPEND_IMMEDIATE. Assign to account: ALTER ACCOUNT SET RESOURCE_MONITOR=account_guard.",
                "url": "https://docs.snowflake.com/en/user-guide/resource-monitors"
              }
            },
            {
              "id": "SFCOST01_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFCOST01_BP01 && SFCOST01_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFCOST01_BP01 || SFCOST01_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFCOST02",
          "title": "How do you right-size Snowflake virtual warehouses and minimize idle compute cost?",
          "description": "Warehouses consume credits only while running. Idle warehouses left running waste credits. Oversized warehouses for simple queries waste credits. Proper auto-suspend, auto-resume, and size selection directly reduce Snowflake costs.",
          "choices": [
            {
              "id": "SFCOST02_BP01",
              "title": "Set aggressive auto-suspend and validate warehouse size against actual query workload",
              "helpfulResource": {
                "displayText": "Set AUTO_SUSPEND = 60 seconds for all interactive warehouses and AUTO_SUSPEND = 120 seconds for ETL/batch warehouses. AUTO_RESUME = TRUE restarts a suspended warehouse automatically when the next query arrives, typically within a few seconds. Query WAREHOUSE_LOAD_HISTORY to identify under- or over-utilized warehouses and adjust size accordingly.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-overview"
              },
              "improvementPlan": {
                "displayText": "Run: ALTER WAREHOUSE <wh> SET AUTO_SUSPEND=60 AUTO_RESUME=TRUE; Query SNOWFLAKE.ACCOUNT_USAGE.WAREHOUSE_LOAD_HISTORY WHERE AVG_RUNNING < 0.1 to find idle warehouses. Use WAREHOUSE_METERING_HISTORY to correlate size vs credits per query.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-tasks"
              }
            },
            {
              "id": "SFCOST02_BP02",
              "title": "Use Snowflake Query Acceleration Service (QAS) for spiky workloads instead of upsizing",
              "helpfulResource": {
                "displayText": "QAS offloads large scans and aggregate queries to serverless compute automatically, reducing the need to maintain oversized warehouses for peak load. Enable on warehouses with high QUEUED_OVERLOAD_TIME or large query variance. Pay only for the acceleration used.",
                "url": "https://docs.snowflake.com/en/user-guide/query-acceleration-service"
              },
              "improvementPlan": {
                "displayText": "Enable with: ALTER WAREHOUSE <wh> SET ENABLE_QUERY_ACCELERATION=TRUE QUERY_ACCELERATION_MAX_SCALE_FACTOR=8. Monitor cost impact via ACCOUNT_USAGE.QUERY_ACCELERATION_HISTORY. Compare credits before/after to validate savings vs warehouse upsizing.",
                "url": "https://docs.snowflake.com/en/user-guide/query-acceleration-service"
              }
            },
            {
              "id": "SFCOST02_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFCOST02_BP01 && SFCOST02_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFCOST02_BP01 || SFCOST02_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFCOST03",
          "title": "How do you proactively cap and receive alerts on Snowflake credit spend?",
          "description": "Snowflake Budgets allow you to set spending thresholds and receive automated email and notification alerts before costs exceed expectations. Without proactive alerting, cost overruns are discovered only after billing cycles close.",
          "choices": [
            {
              "id": "SFCOST03_BP01",
              "title": "Create Snowflake Budget objects with threshold alerts for critical cost centers",
              "helpfulResource": {
                "displayText": "Use the BUDGET class (available in SNOWFLAKE.CORE) to define spending targets and notification thresholds. Budgets can scope to the full account, specific warehouses, databases, or custom object groups. Set thresholds at 80% and 100% of expected monthly spend.",
                "url": "https://docs.snowflake.com/en/user-guide/budgets"
              },
              "improvementPlan": {
                "displayText": "Run: CALL SNOWFLAKE.LOCAL.ACCOUNT_ROOT_BUDGET!ACTIVATE(); CALL SNOWFLAKE.LOCAL.ACCOUNT_ROOT_BUDGET!SET_SPENDING_LIMIT(1000); CALL SNOWFLAKE.LOCAL.ACCOUNT_ROOT_BUDGET!SET_EMAIL_NOTIFICATIONS('team@company.com'); Review actual vs budget in the Snowflake Cost Management UI or via CALL SNOWFLAKE.LOCAL.ACCOUNT_ROOT_BUDGET!GET_SPENDING_HISTORY().",
                "url": "https://docs.snowflake.com/en/user-guide/budgets"
              }
            },
            {
              "id": "SFCOST03_BP02",
              "title": "Integrate Snowflake budget alerts with external notification systems (SNS, PagerDuty, Slack)",
              "helpfulResource": {
                "displayText": "Snowflake notification integrations allow budget and alert triggers to route to AWS SNS, email, or webhooks. Connecting Snowflake budget alerts to your incident management platform ensures the right on-call team is notified in real time, not just via email.",
                "url": "https://docs.snowflake.com/en/user-guide/notifications/notification-integrations"
              },
              "improvementPlan": {
                "displayText": "Create: CREATE NOTIFICATION INTEGRATION budget_sns ENABLED=TRUE DIRECTION=OUTBOUND TYPE=QUEUE NOTIFICATION_PROVIDER=AWS_SNS AWS_SNS_TOPIC_ARN='<arn>' AWS_SNS_ROLE_ARN='<iam_role>'; Then reference this integration in ALERT or BUDGET notification configuration.",
                "url": "https://docs.snowflake.com/en/user-guide/notifications/notification-integrations"
              }
            },
            {
              "id": "SFCOST03_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFCOST03_BP01 && SFCOST03_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFCOST03_BP01 || SFCOST03_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFCOST04",
          "title": "How do you attribute Snowflake credit consumption to specific teams, workloads, or cost centers?",
          "description": "Without attribution, Snowflake costs appear as a single line item. Object tags and query tags enable per-team, per-pipeline cost attribution that drives accountability, chargeback, and optimization incentives.",
          "choices": [
            {
              "id": "SFCOST04_BP01",
              "title": "Apply object tags to warehouses and databases for cost center attribution",
              "helpfulResource": {
                "displayText": "Use Snowflake object tagging to label warehouses, databases, and schemas with cost_center, team, and environment attributes. Join ACCOUNT_USAGE.TAG_REFERENCES with WAREHOUSE_METERING_HISTORY to produce per-team credit reports. This enables chargeback and showback across business units.",
                "url": "https://docs.snowflake.com/en/user-guide/object-tagging"
              },
              "improvementPlan": {
                "displayText": "Create tags: CREATE TAG cost_center; CREATE TAG team; ALTER WAREHOUSE analytics_wh SET TAG cost_center='finance', team='analytics'; Query: SELECT t.tag_value, SUM(w.credits_used) FROM SNOWFLAKE.ACCOUNT_USAGE.WAREHOUSE_METERING_HISTORY w JOIN SNOWFLAKE.ACCOUNT_USAGE.TAG_REFERENCES t ON w.warehouse_name=t.object_name WHERE t.domain='WAREHOUSE' AND t.tag_name='COST_CENTER' GROUP BY 1.",
                "url": "https://docs.snowflake.com/en/user-guide/object-tagging"
              }
            },
            {
              "id": "SFCOST04_BP02",
              "title": "Set QUERY_TAG on all programmatic sessions for query-level attribution in QUERY_HISTORY",
              "helpfulResource": {
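                "displayText": "Setting a QUERY_TAG on every Lambda, Spark, or dbt session allows per-pipeline cost tracking: ACCOUNT_USAGE.QUERY_ATTRIBUTION_HISTORY reports warehouse compute credits per query (CREDITS_ATTRIBUTED_COMPUTE), while the CREDITS_USED_CLOUD_SERVICES column in ACCOUNT_USAGE.QUERY_HISTORY covers only cloud services credits. This complements warehouse-level attribution with query-level granularity.",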
                "url": "https://docs.snowflake.com/en/sql-reference/sql/alter-session"
              },
              "improvementPlan": {
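                "displayText": "Set at session start: ALTER SESSION SET QUERY_TAG='pipeline=flight-ingest,team=data-eng,env=prod'. Query warehouse compute credits by tag: SELECT query_tag, SUM(credits_attributed_compute) FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_ATTRIBUTION_HISTORY WHERE start_time >= DATEADD('day',-30,CURRENT_TIMESTAMP()) GROUP BY 1 ORDER BY 2 DESC. Cloud services credits by tag are available separately via SUM(credits_used_cloud_services) in SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY.",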
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/query_history"
              }
            },
            {
              "id": "SFCOST04_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFCOST04_BP01 && SFCOST04_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFCOST04_BP01 || SFCOST04_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFCOST05",
          "title": "How do you evaluate and choose between serverless and warehouse-based compute for cost efficiency?",
          "description": "Snowflake offers both serverless compute (Snowpipe, Tasks, Search Optimization, Cortex) and warehouse-based compute. Choosing the wrong model for a workload leads to systematic overpayment. Serverless is generally cheaper for irregular or small jobs; warehouses are generally cheaper for sustained high-throughput workloads.",
          "choices": [
            {
              "id": "SFCOST05_BP01",
              "title": "Use serverless features (Snowpipe Streaming, serverless Tasks) for event-driven and low-frequency workloads",
              "helpfulResource": {
                "displayText": "Snowpipe Streaming and serverless Tasks bill based on actual usage rather than warehouse uptime, and Cortex AI functions bill per token processed, so none of them incurs warehouse spin-up or idle cost. These are the most cost-efficient options for event-driven pipelines, scheduled light transforms, and AI inference with irregular traffic patterns.",
                "url": "https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview"
              },
              "improvementPlan": {
                "displayText": "Audit ACCOUNT_USAGE.SERVERLESS_TASK_HISTORY and PIPE_USAGE_HISTORY to understand serverless credit consumption. Compare against equivalent warehouse-based cost. For tasks running < 1 minute per execution, serverless is almost always cheaper. For sustained ETL jobs > 30 min/run, warehouse is cheaper.",
                "url": "https://docs.snowflake.com/en/user-guide/tasks-intro"
              }
            },
            {
              "id": "SFCOST05_BP02",
              "title": "Benchmark and document the cost crossover point between serverless and warehouse for key pipelines",
              "helpfulResource": {
                "displayText": "The break-even between serverless Snowpipe and warehouse-based COPY INTO is approximately 1000 files/hour. Above this, high-performance Snowpipe Streaming is cheaper. Below this, COPY INTO with a small warehouse is cheaper. Treat this figure as a starting estimate: document the measured thresholds for each pipeline and revisit quarterly as volume changes.",
                "url": "https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview"
              },
              "improvementPlan": {
                "displayText": "Query PIPE_USAGE_HISTORY and WAREHOUSE_METERING_HISTORY to calculate credits/MB ingested for each approach. Build a cost model: serverless_credits_per_MB vs (warehouse_credits_per_hour / avg_MB_per_hour). Update quarterly as data volumes change.",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/pipe_usage_history"
              }
            },
            {
              "id": "SFCOST05_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFCOST05_BP01 && SFCOST05_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFCOST05_BP01 || SFCOST05_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        }
      ]
    },
    {
      "id": "sustainability",
      "name": "Sustainability",
      "questions": [
        {
          "id": "SFSUS01",
          "title": "How do you optimize compute efficiency through warehouse consolidation?",
          "description": "Warehouse consolidation reduces idle compute resources by combining workloads with compatible performance requirements onto shared warehouses, lowering overall energy consumption and credit spend.",
          "choices": [
            {
              "id": "SFSUS01_BP01",
              "title": "Analyze warehouse utilization and consolidate underutilized warehouses",
              "helpfulResource": {
                "displayText": "Review WAREHOUSE_METERING_HISTORY and QUERY_HISTORY to identify warehouses with low utilization or overlapping workload profiles. Consolidate compatible workloads onto fewer, appropriately sized warehouses to reduce idle compute time and improve resource efficiency.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-overview"
              },
              "improvementPlan": {
                "displayText": "Query WAREHOUSE_METERING_HISTORY to identify warehouses with less than 30% average utilization. Analyze query patterns to identify workloads that can share a warehouse without SLA conflicts. Consolidate compatible workloads and resize the target warehouse appropriately. Enable auto-suspend with aggressive timeouts (60 seconds) for consolidated warehouses. Monitor post-consolidation performance to ensure SLAs are maintained.",
                "url": "https://docs.snowflake.com/en/user-guide/warehouses-overview"
              }
            },
            {
              "id": "SFSUS01_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSUS01_BP01",
              "risk": "NO_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFSUS02",
          "title": "How do you manage data retention to reduce unnecessary storage consumption?",
          "description": "Excessive data retention increases storage costs and environmental impact. Implementing data lifecycle policies ensures that only necessary data is retained, while expired or obsolete data is systematically purged.",
          "choices": [
            {
              "id": "SFSUS02_BP01",
              "title": "Implement data lifecycle and retention policies",
              "helpfulResource": {
                "displayText": "Define data retention policies that specify how long data should be retained in active Snowflake tables, Time Travel, and Fail-safe. Reduce Time Travel retention for non-critical tables to minimize storage. Use Snowflake streams and tasks to automate data archival to lower-cost storage tiers (e.g., Amazon S3 Glacier).",
                "url": "https://docs.snowflake.com/en/user-guide/data-time-travel"
              },
              "improvementPlan": {
                "displayText": "Classify tables by data criticality and set appropriate DATA_RETENTION_TIME_IN_DAYS (e.g., 1 day for staging tables, 7-90 days for production). Identify and drop unused or orphaned tables and schemas. Implement automated archival pipelines that move cold data to external stages backed by S3 with lifecycle policies to Glacier. Monitor total storage consumption trends monthly and set reduction targets.",
                "url": "https://docs.snowflake.com/en/user-guide/data-time-travel"
              }
            },
            {
              "id": "SFSUS02_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSUS02_BP01",
              "risk": "NO_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFSUS03",
          "title": "How do you optimize query efficiency to reduce unnecessary compute energy in Snowflake?",
          "description": "Inefficient queries — full table scans, missing clustering, repeated computation — waste compute credits and energy. Optimizing query patterns reduces both cost and carbon footprint without sacrificing functionality.",
          "choices": [
            {
              "id": "SFSUS03_BP01",
              "title": "Use clustering keys and Search Optimization to reduce bytes scanned per query",
              "helpfulResource": {
                "displayText": "Adding clustering keys on frequently filtered columns (e.g., event_date, region) improves micro-partition pruning and dramatically reduces bytes scanned. Search Optimization Service adds point-lookup acceleration for equality predicates. Both reduce per-query compute and therefore energy.",
                "url": "https://docs.snowflake.com/en/user-guide/tables-clustering-keys"
              },
              "improvementPlan": {
                "displayText": "Identify clustering candidates: use SNOWFLAKE.ACCOUNT_USAGE.TABLE_STORAGE_METRICS to find the largest tables (by ACTIVE_BYTES), then evaluate each with SELECT SYSTEM$CLUSTERING_INFORMATION('events','(event_date, region)') — high average_overlaps and average_depth values indicate poor clustering on the filter columns. Add clustering: ALTER TABLE events CLUSTER BY (event_date, region). Monitor with SYSTEM$CLUSTERING_INFORMATION('events','(event_date, region)').",
                "url": "https://docs.snowflake.com/en/user-guide/tables-clustering-keys"
              }
            },
            {
              "id": "SFSUS03_BP02",
              "title": "Leverage result caching and avoid redundant computation across pipelines",
              "helpfulResource": {
                "displayText": "Snowflake's result cache returns query results instantly for identical queries within 24 hours — zero warehouse compute, zero energy. Design pipelines to reuse cached results where possible. Use Dynamic Tables for incremental refresh (process only changed rows) instead of full re-computation each run.",
                "url": "https://docs.snowflake.com/en/user-guide/querying-persisted-results"
              },
              "improvementPlan": {
                "displayText": "Monitor cache hit rate: SELECT (SUM(result_cache_hits)/COUNT(*))::FLOAT AS cache_hit_rate FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY WHERE start_time >= DATEADD('day',-7,CURRENT_TIMESTAMP()); Target > 30% cache hit rate for analytics workloads. Refactor queries that re-scan the same data within 24 hours.",
                "url": "https://docs.snowflake.com/en/user-guide/querying-persisted-results"
              }
            },
            {
              "id": "SFSUS03_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSUS03_BP01 && SFSUS03_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFSUS03_BP01 || SFSUS03_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFSUS04",
          "title": "How do you leverage Snowflake's multi-tenant shared infrastructure for sustainability benefits?",
          "description": "Snowflake's shared multi-tenant architecture achieves 7-10x better compute utilization than dedicated single-tenant databases. Understanding and communicating this advantage helps justify Snowflake adoption in sustainability-focused organizations.",
          "choices": [
            {
              "id": "SFSUS04_BP01",
              "title": "Prefer Snowflake-managed serverless features over self-managed compute for eligible workloads",
              "helpfulResource": {
                "displayText": "Snowflake serverless features (Snowpipe Streaming, serverless Tasks, Cortex AI, Search Optimization) run on Snowflake's shared compute pool, which is bin-packed across thousands of tenants for maximum utilization efficiency. These are more sustainable than dedicated warehouse compute for irregular or low-throughput workloads.",
                "url": "https://docs.snowflake.com/en/user-guide/cost-understanding-overall"
              },
              "improvementPlan": {
                "displayText": "Audit which workloads use dedicated warehouse compute vs serverless: compare SERVERLESS_TASK_HISTORY credits vs WAREHOUSE_METERING_HISTORY credits. Migrate eligible scheduled tasks (< 5 min runtime, < hourly frequency) from warehouse-based to serverless. Serverless Tasks require no warehouse and use shared compute.",
                "url": "https://docs.snowflake.com/en/user-guide/tasks-serverless-vs-user-managed"
              }
            },
            {
              "id": "SFSUS04_BP02",
              "title": "Colocate Snowflake account and AWS workloads in the same region to minimize cross-region data transfer",
              "helpfulResource": {
                "displayText": "Cross-region data transfer between AWS and Snowflake generates additional network energy consumption and egress costs. Ensuring the Snowflake account is in the same AWS region as data producers eliminates this overhead and keeps data within the same physical datacenter footprint.",
                "url": "https://docs.snowflake.com/en/user-guide/admin-account-identifier"
              },
              "improvementPlan": {
                "displayText": "Verify colocation: compare Snowflake account region (SELECT CURRENT_REGION()) with AWS region of your Kinesis/MSK/S3 resources. If they differ, evaluate the cost and sustainability impact of migrating the Snowflake account or adding a regional replication target in the matching region.",
                "url": "https://docs.snowflake.com/en/user-guide/admin-account-identifier"
              }
            },
            {
              "id": "SFSUS04_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFSUS04_BP01 && SFSUS04_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFSUS04_BP01 || SFSUS04_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        }
      ]
    },
    {
      "id": "jointIntegration",
      "name": "Joint Integration (AWS + Snowflake)",
      "questions": [
        {
          "id": "SFJOINT01",
          "title": "How do you validate the end-to-end credential handoff between AWS and Snowflake?",
          "description": "The AWS-to-Snowflake authentication chain (IAM role to Secrets Manager to RSA private key to Snowflake keypair auth) has multiple failure points. A break at any link silently stops ingestion. End-to-end validation ensures the chain is tested, monitored, and rotatable.",
          "choices": [
            {
              "id": "SFJOINT01_BP01",
              "title": "Document and test the full credential chain from IAM to Snowflake in a runbook",
              "helpfulResource": {
                "displayText": "Map the full auth path: Lambda execution role → GetSecretValue on SM secret → RSA PEM key → Snowflake JWT assertion → Snowflake session. Test end-to-end by invoking the Lambda with a test record and confirming data appears in Snowflake. Include in your deployment checklist and disaster recovery runbook.",
                "url": "https://docs.snowflake.com/en/user-guide/key-pair-auth"
              },
              "improvementPlan": {
                "displayText": "Validate with: (1) aws secretsmanager get-secret-value --secret-id <name> (confirms IAM access); (2) run Lambda test event and check CloudWatch for Snowflake auth errors; (3) SELECT CURRENT_USER(), CURRENT_ROLE() in Snowflake to confirm identity. Automate this validation in CI/CD post-deployment smoke test.",
                "url": "https://docs.snowflake.com/en/user-guide/key-pair-auth"
              }
            },
            {
              "id": "SFJOINT01_BP02",
              "title": "Implement automated RSA key rotation without pipeline downtime",
              "helpfulResource": {
                "displayText": "RSA private keys should be rotated periodically (at minimum annually, ideally quarterly). Snowflake supports two simultaneous RSA public keys per user (RSA_PUBLIC_KEY and RSA_PUBLIC_KEY_2) enabling zero-downtime rotation: register new key as key 2, update SM secret, verify pipeline uses new key, then remove old key.",
                "url": "https://docs.snowflake.com/en/user-guide/key-pair-auth-key-rotation"
              },
              "improvementPlan": {
                "displayText": "Rotation procedure: (1) Generate new keypair; (2) ALTER USER <user> SET RSA_PUBLIC_KEY_2='<new_pub_key>'; (3) Update SM secret with new private key; (4) Trigger Lambda invocation and confirm success; (5) ALTER USER <user> UNSET RSA_PUBLIC_KEY (remove old key); (6) Confirm only RSA_PUBLIC_KEY_2 remains active.",
                "url": "https://docs.snowflake.com/en/user-guide/key-pair-auth-key-rotation"
              }
            },
            {
              "id": "SFJOINT01_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFJOINT01_BP01 && SFJOINT01_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFJOINT01_BP01 || SFJOINT01_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFJOINT02",
          "title": "How do you secure and validate the network path between AWS compute and Snowflake?",
          "description": "Without network controls, AWS Lambda or ECS tasks connect to Snowflake over the public internet, exposing data to exfiltration risk and bypassing compliance requirements for private connectivity. PrivateLink eliminates this exposure.",
          "choices": [
            {
              "id": "SFJOINT02_BP01",
              "title": "Deploy AWS compute in a VPC and connect to Snowflake via PrivateLink private endpoint",
              "helpfulResource": {
                "displayText": "AWS PrivateLink for Snowflake creates a private endpoint in your VPC that routes all traffic to Snowflake without traversing the internet. Lambda, ECS, and Glue deployed in a VPC route through this endpoint. Combined with a Snowflake network policy allowing only the VPC endpoint IP, this eliminates all public internet exposure.",
                "url": "https://docs.snowflake.com/en/user-guide/admin-security-privatelink"
              },
              "improvementPlan": {
                "displayText": "Steps: (1) Enable PrivateLink on Snowflake account (Snowflake Support or Business Critical edition); (2) Create VPC Interface Endpoint in AWS pointing to Snowflake's service name; (3) Update Snowflake account URL to use privatelink endpoint; (4) Add Snowflake network policy allowing only the endpoint IP; (5) Test connectivity from Lambda/ECS inside the VPC.",
                "url": "https://docs.snowflake.com/en/user-guide/admin-security-privatelink"
              }
            },
            {
              "id": "SFJOINT02_BP02",
              "title": "Configure Snowflake network policy to allowlist only known AWS source IPs",
              "helpfulResource": {
                "displayText": "Even without PrivateLink, a Snowflake network policy restricting access to your AWS NAT Gateway IPs (for Lambda) or VPC CIDR prevents access from unauthorized sources. This is the minimum network control when PrivateLink is not yet deployed.",
                "url": "https://docs.snowflake.com/en/user-guide/network-policies"
              },
              "improvementPlan": {
                "displayText": "Create: CREATE NETWORK RULE aws_nat_rule TYPE=IPV4 VALUE_LIST=('<nat_gw_ip>/32') MODE=INGRESS; CREATE NETWORK POLICY aws_only_policy ALLOWED_NETWORK_RULE_LIST=(aws_nat_rule); ALTER ACCOUNT SET NETWORK_POLICY=aws_only_policy; Verify: SELECT * FROM SNOWFLAKE.ACCOUNT_USAGE.LOGIN_HISTORY to confirm only expected IPs appear.",
                "url": "https://docs.snowflake.com/en/user-guide/network-policies"
              }
            },
            {
              "id": "SFJOINT02_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFJOINT02_BP01 && SFJOINT02_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFJOINT02_BP01 || SFJOINT02_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFJOINT03",
          "title": "How do AWS-side pipeline failures surface in Snowflake monitoring and vice versa?",
          "description": "AWS DLQ events, Lambda errors, and Kinesis iterator age increases are invisible in Snowflake monitoring. Snowflake Task failures and pipe errors are invisible in CloudWatch. Without bridging these, incidents span both systems undetected until data freshness SLAs breach.",
          "choices": [
            {
              "id": "SFJOINT03_BP01",
              "title": "Create a unified alerting bridge: route AWS CloudWatch alarms and Snowflake ALERTs to the same notification channel",
              "helpfulResource": {
                "displayText": "Route AWS CloudWatch alarms (Lambda errors, DLQ depth, Kinesis IteratorAge) and Snowflake ALERT objects (Task failures, pipe errors, query failures) to the same SNS topic or PagerDuty service. This creates a single pane of glass for cross-platform pipeline health without requiring a separate monitoring tool.",
                "url": "https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html"
              },
              "improvementPlan": {
                "displayText": "AWS side: Create CloudWatch Alarms on Lambda Errors, SQS ApproximateNumberOfMessagesVisible (DLQ), and Kinesis GetRecords.IteratorAgeMilliseconds → route to SNS. Snowflake side: Create ALERT on TASK_HISTORY failures → route to same SNS via notification integration. Both alert types land in the same PagerDuty/Slack channel.",
                "url": "https://docs.snowflake.com/en/user-guide/notifications/notification-integrations"
              }
            },
            {
              "id": "SFJOINT03_BP02",
              "title": "Build an end-to-end latency monitor that measures AWS ingestion lag through to Snowflake data freshness",
              "helpfulResource": {
                "displayText": "E2E latency = (Kinesis PutRecord timestamp) - (Snowflake row LAST_CONTACT or ingestion timestamp). Measure this by comparing the MAX event timestamp in the Snowflake table against current time. Alert if data freshness exceeds your SLA (e.g., > 5 minutes for near-real-time pipelines).",
                "url": "https://docs.snowflake.com/en/sql-reference/account-usage/pipe_usage_history"
              },
              "improvementPlan": {
                "displayText": "Create a Snowflake scheduled task: SELECT DATEDIFF('second', MAX(time_position), CURRENT_TIMESTAMP()) AS lag_seconds FROM flight_data WHERE time_position IS NOT NULL; If lag_seconds > 300, trigger alert. Combine with CloudWatch custom metric publishing from Lambda (emit ingest timestamp as metric) for full visibility.",
                "url": "https://docs.snowflake.com/en/user-guide/tasks-intro"
              }
            },
            {
              "id": "SFJOINT03_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFJOINT03_BP01 && SFJOINT03_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFJOINT03_BP01 || SFJOINT03_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFJOINT04",
          "title": "How do you maintain a shared schema contract between AWS data producers and Snowflake consumers?",
          "description": "When AWS producers (Lambda, Glue, Kafka connectors) and Snowflake consumers (dashboards, ML models, dbt models) evolve independently, schema mismatches cause silent data loss, downstream breakage, or ingestion failures. A schema contract prevents uncoordinated changes.",
          "choices": [
            {
              "id": "SFJOINT04_BP01",
              "title": "Use Snowflake schema evolution with MATCH_BY_COLUMN_NAME as a safe default for additive changes",
              "helpfulResource": {
                "displayText": "Enable ENABLE_SCHEMA_EVOLUTION=TRUE on ingest tables so new fields from producers auto-add columns instead of failing. Use MATCH_BY_COLUMN_NAME=CASE_INSENSITIVE in COPY INTO / pipe definitions so field reordering is transparent. This handles additive changes (new fields) without producer-consumer coordination.",
                "url": "https://docs.snowflake.com/en/user-guide/data-load-schema-evolution"
              },
              "improvementPlan": {
                "displayText": "ALTER TABLE <table> SET ENABLE_SCHEMA_EVOLUTION=TRUE; GRANT EVOLVE SCHEMA ON TABLE <table> TO ROLE <ingest_role>; Test: send a record with a new field and confirm DESCRIBE TABLE shows the new column. Monitor ACCOUNT_USAGE.QUERY_HISTORY for ALTER TABLE events generated by schema evolution to track when producers add fields.",
                "url": "https://docs.snowflake.com/en/user-guide/data-load-schema-evolution"
              }
            },
            {
              "id": "SFJOINT04_BP02",
              "title": "Implement a schema registry or contract test to catch breaking changes before production",
              "helpfulResource": {
                "displayText": "For breaking changes (column removal, type change, rename), a schema registry or contract test in CI/CD is required. Run a Snowflake DESCRIBE TABLE against the expected schema in CI. If the actual schema diverges from the contract, fail the deployment. This prevents broken dashboards and ML features from reaching production.",
                "url": "https://docs.snowflake.com/en/sql-reference/sql/desc-table"
              },
              "improvementPlan": {
                "displayText": "In CI pipeline: snow sql -q 'DESCRIBE TABLE prod_db.public.events' | compare columns against schema_contract.json; fail CI if columns removed or types changed. Maintain schema_contract.json in the same repo as the producer (Lambda handler) so producer and contract evolve together via PR.",
                "url": "https://docs.snowflake.com/en/developer-guide/snowflake-cli/index"
              }
            },
            {
              "id": "SFJOINT04_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFJOINT04_BP01 && SFJOINT04_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFJOINT04_BP01 || SFJOINT04_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        },
        {
          "id": "SFJOINT05",
          "title": "How do you govern AI/ML workloads that span AWS Bedrock or SageMaker and Snowflake Cortex?",
          "description": "Joint AI/ML pipelines (data in Snowflake to model in SageMaker/Bedrock to inference results back to Snowflake) introduce new governance gaps: PII in prompt contexts, model lineage, AI credit costs, and inference result quality. Neither the AWS lens nor the Snowflake lens addresses these jointly.",
          "choices": [
            {
              "id": "SFJOINT05_BP01",
              "title": "Apply data governance controls to all data passed to LLMs via Cortex AI functions or Bedrock",
              "helpfulResource": {
                "displayText": "Data passed to Cortex AI functions (CORTEX.COMPLETE, AI_CLASSIFY, AI_EXTRACT) or exported to Bedrock for inference must be governed the same as any sensitive data. Apply column masking on PII columns before they enter AI pipelines. Use row access policies to prevent unauthorized roles from accessing AI-enriched outputs.",
                "url": "https://docs.snowflake.com/en/user-guide/snowflake-cortex/llm-functions"
              },
              "improvementPlan": {
                "displayText": "Before using AI functions on sensitive data: (1) Run AI_CLASSIFY or EXTRACT on a masked/anonymized copy; (2) Apply dynamic data masking on PII columns for non-privileged roles; (3) Audit QUERY_HISTORY for CORTEX.COMPLETE calls to confirm only authorized roles are using AI functions; (4) Tag AI-derived columns with a 'ai_generated' object tag.",
                "url": "https://docs.snowflake.com/en/user-guide/snowflake-cortex/llm-functions"
              }
            },
            {
              "id": "SFJOINT05_BP02",
              "title": "Monitor and attribute Snowflake Cortex AI credit consumption and establish per-model cost guardrails",
              "helpfulResource": {
                "displayText": "Cortex AI functions (COMPLETE, EXTRACT, CLASSIFY) consume Snowflake credits based on tokens processed. Without monitoring, AI feature development can unexpectedly exhaust credit budgets. Track AI credit consumption via ACCOUNT_USAGE.METERING_HISTORY and set resource monitor thresholds specific to AI workloads.",
                "url": "https://docs.snowflake.com/en/user-guide/cost-understanding-compute-credits"
              },
              "improvementPlan": {
                "displayText": "Query AI credit usage: SELECT service_type, SUM(credits_used) FROM SNOWFLAKE.ACCOUNT_USAGE.METERING_HISTORY WHERE service_type LIKE '%CORTEX%' GROUP BY 1; Create a resource monitor scoped to AI workloads. For Bedrock: use AWS Cost Explorer filtered by service=Bedrock and tag=ai-pipeline to track inference costs alongside Snowflake Cortex costs.",
                "url": "https://docs.snowflake.com/en/user-guide/cost-understanding-compute-credits"
              }
            },
            {
              "id": "SFJOINT05_no",
              "title": "None of these",
              "helpfulResource": {
                "displayText": "Choose this if your workload does not follow these best practices."
              }
            }
          ],
          "riskRules": [
            {
              "condition": "SFJOINT05_BP01 && SFJOINT05_BP02",
              "risk": "NO_RISK"
            },
            {
              "condition": "SFJOINT05_BP01 || SFJOINT05_BP02",
              "risk": "MEDIUM_RISK"
            },
            {
              "condition": "default",
              "risk": "HIGH_RISK"
            }
          ]
        }
      ]
    }
  ]
}
