Configuration

Configuration guide

This document describes configuration options available for the Helm chart and the virtual kubelet. Both can be obviously set from shared values.yaml file.

Deployment

deployment:
  # virtual kubelet pod can run on any node in the cluster; limit the restarts and rescheduling for better performance
  affinity: {}
  nodeSelector: {}
  tolerations: []

  # VK Service Account annotation allows you to reference AWS IAM role and gain permission to generate an access token for ECR
  serviceAccount:
    annotations: {}

  gateway:
    # enum: [aws-eks | node-port]
    class: ""
    # hostname or IP address of the gateway
    domain: ""
    # effective only with 'node-port' class
    nodePort: 31000

  containers:
    virtualKubelet:
      image:
        repository: "391135486350.dkr.ecr.eu-central-1.amazonaws.com/gpu-provider"
        tag: latest
        pullPolicy: Always

      resources:
        requests:
          cpu: "0.5"
          memory: "512Mi"
        limits:
          cpu: "0.5"
          memory: "512Mi"

    gateway:
      image:
        repository: "public.ecr.aws/m4v1f8q5/gpu-provider/gateway"
        tag: latest
        pullPolicy: Always

      resources:
        requests:
          cpu: "0.5"
          memory: "256Mi"
        limits:
          cpu: "0.5"
          memory: "256Mi"

Virtual Kubelet Configuration

appConfig:
  logLevel: "info"

  # multiple clusters can run under same cloud provider account;
  # correctly differentiate them by clusterUUID so garbage collection works properly
  cluster:
    clusterUUID: ""

  cloudProvider:
    vastAI:
      apiKey: ""

  virtualNode:
    # virtual node is created under with following name 
    nodeName: "virtual-node"
    # current hard limit is 64 pods per node
    podLimit: "10"
    # virtual node will report the following resources and scheduler will consider them when scheduling pods 
    cpu: "1000"
    memory: "1000Gi"
    labels: {}
    # additional taints for the node; see default taints below
    taints: []

  provisioning:
    # maximum provisioning attempts; most provisioning steps are retried until timeout;  
    maxRetries: 10
    
    # timeout for complete provisioning
    startupTimeout: "10m"
    
    # container agent reports status of a remote containers; should it fail to report within this timeout; 
    # the pod will be restarted and the running machine destroyed
    statusReportTimeout: "2m"
    
    # should a machine fail during provisioning or fail to start within before startupTimeout
    # it is banned from further attempts for specified period 
    machineBansStore:
      localFile:
        enable: true
        timeout: "1d"


  # push container logs to the Loki gateway; right now this is the only way to export logs on managed Kubernetes 
  promtail:
    enable: false
    clients: []
    # clients: []
    #   - url: "https://loki-gateway.example.com/loki/api/v1/push"
    #     basicAuth:
    #       username: ""
    #       password: ""

Last modified January 30, 2026: docs (a584816)