vcjob: a Volcano Job (vcjob) example for delivering a distributed PyTorch training job on Ascend 910 NPUs

apiVersion: v1
kind: ConfigMap
metadata:
  name: rings-config-mindx-dls-test     # The suffix must be the same as the name of the Job below. The rings-config- prefix cannot be modified.
  namespace: vcjob                      # Select a proper namespace based on site requirements. (The ConfigMap and the Job must be in the same namespace. If the tjm component of MindX-add is deployed, the vcjob namespace cannot be used.)
  labels:
    ring-controller.atlas: ascend-910   # The value cannot be modified; the ring controller performs service operations based on this label.
data:
  hccl.json: |
    {
        "status":"initializing"
    }
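# Note: hccl.json starts out as {"status":"initializing"}. In MindX DL, the
# hccl-controller watches ConfigMaps whose names carry the rings-config- prefix
# and rewrites hccl.json with the generated HCCL ranktable once the job's pods
# have been scheduled.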
---
apiVersion: batch.volcano.sh/v1alpha1   # The value cannot be changed. The Volcano API must be used.
kind: Job                               # Only the Job type is currently supported.
metadata:
  name: mindx-dls-test                  # The value must match the suffix of the ConfigMap name above (rings-config-<job name>).
  namespace: vcjob                      # Select a proper namespace based on site requirements. (The ConfigMap and the Job must be in the same namespace. If the tjm component of MindX-add is deployed, the vcjob namespace cannot be used.)
  labels:
    ring-controller.atlas: ascend-910   # The value must be the same as the label in ConfigMap and cannot be changed.
    fault-scheduling: "force"           # MindX DL fault rescheduling policy: reschedule the job when an NPU fault is detected.
spec:
  minAvailable: 1                       # The value of minAvailable is 1 in a single-node scenario and N in an N-node distributed scenario.
  schedulerName: volcano                # Use the Volcano scheduler to schedule jobs.
  policies:
    - event: PodEvicted                 # Restart the entire job when any of its pods is evicted.
      action: RestartJob
  plugins:
    ssh: []                             # Volcano ssh plugin: sets up passwordless SSH among the job's pods.
    env: []                             # Volcano env plugin: injects task index environment variables into containers.
    svc: []                             # Volcano svc plugin: lets the job's pods resolve and reach each other.
  maxRetry: 3                           # Maximum number of times the job is retried.
  queue: default                        # Volcano queue to which the job is submitted.
  tasks:
  - name: "default-test"
    replicas: 1                              # The value of replicas is 1 in a single-node scenario and N in an N-node scenario. The number of NPUs in the requests field is 8 in an N-node scenario.
    template:
      metadata:
        labels:
          app: pytorch
          ring-controller.atlas: ascend-910  # The value must be the same as the label in ConfigMap and cannot be changed.
      spec:
        affinity:
          podAntiAffinity:                   # Required anti-affinity: spread this job's pods across different nodes.
            requiredDuringSchedulingIgnoredDuringExecution:
              - labelSelector:
                  matchExpressions:
                    - key: volcano.sh/job-name
                      operator: In
                      values:
                        - mindx-dls-test
                topologyKey: kubernetes.io/hostname
        hostNetwork: true                    # Use the host network so that distributed training traffic goes over the node's physical NICs.
        containers:
        - image: torch:b030               # Training framework image, which can be modified.
          imagePullPolicy: IfNotPresent
          name: pytorch
          env:
          - name: mindx-dls-test                               # The env variable name must be the same as the job name.
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
          - name: XDL_IP                                       # IP address of the physical node, which is used to identify the node where the pod is running
            valueFrom:
              fieldRef:
                fieldPath: status.hostIP
          - name: framework
            value: "PyTorch"
          command:
          - "/bin/bash"
          - "-c"
          # Commands for running the training script. Ensure that the involved commands and paths exist in the container image.
          - "cd /job/code/ResNet50_for_PyTorch_1.8_code/scripts;chmod +x train_start.sh;bash train_start.sh /job/code/ResNet50_for_PyTorch_1.8_code/ /job/output/ DistributedResnet50/main_apex_d76_npu.py --data=/job/data/resnet50/imagenet --seed=49 --worker=128 --learning-rate=1.6 --warmup=8 --label-smoothing=0.1 --mom=0.9 --weight-decay=1.0e-04 --static-loss-scale=128 --print-freq=1 --dist-url='tcp://127.0.0.1:50000' --dist-backend='hccl' --multiprocessing-distributed --benchmark=0 --device='npu' --epoch=90 --batch-size=1024;"
            #args: [ "while true; do sleep 30000; done;"  ]   # To debug, comment out the preceding script line and enable this one; the container then stays alive so that you can run the training script manually.
                                                              # Enter the container with 'kubectl exec -it -n {namespace} {podname} bash'.
          resources:
            requests:
              huawei.com/Ascend910: 8                 # Number of required NPUs. The maximum value is 8. You can add lines below to configure resources such as memory and CPU.
            limits:
              huawei.com/Ascend910: 8                 # The value must be consistent with that in requests.
          volumeMounts:
          - name: ascend-910-config
            mountPath: /user/serverid/devindex/config # hccl.json (the ranktable) from the ConfigMap is mounted here.
          - name: code
            mountPath: /job/code/                     # Path of the training script in the container.
          - name: data
            mountPath: /job/data                      # Path of the training dataset in the container.
          - name: output
            mountPath: /job/output                    # Training output path in the container.
          - name: slog
            mountPath: /var/log/npu
          - name: ascend-driver
            mountPath: /usr/local/Ascend/driver
          - name: ascend-add-ons
            mountPath: /usr/local/Ascend/add-ons
          - name: dshm
            mountPath: /dev/shm                       # Shared memory used by PyTorch DataLoader workers.
          - name: localtime
            mountPath: /etc/localtime
        nodeSelector:
          host-arch: huawei-arm                       # Schedule onto Huawei ARM nodes; set the label according to the actual cluster.
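        # The target nodes must carry this label beforehand; a hypothetical example:
        #   kubectl label node <node-name> host-arch=huawei-arm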
        volumes:
        - name: ascend-910-config
          configMap:
            name: rings-config-mindx-dls-test         # Correspond to the ConfigMap name above.
        - name: code
          nfs:
            server: 127.0.0.1                         # IP address of the NFS server. In this example, the shared path is /data/atlas_dls/.
            path: "/data/atlas_dls/public/code/"             # Configure the training script path.
        - name: data
          nfs:
            server: 127.0.0.1
            path: "/data/atlas_dls/public/dataset"    # Configure the path of the training set.
        - name: output
          nfs:
            server: 127.0.0.1
            path: "/data/atlas_dls/output/"           # Configure the path for saving the configuration model, which is related to the script.
        - name: slog
          hostPath:
            path: /var/log/npu                        # Configure the NPU log path and mount it to the local host.
        - name: ascend-driver
          hostPath:
            path: /usr/local/Ascend/driver
        - name: ascend-add-ons
          hostPath:
            path: /usr/local/Ascend/add-ons
        - name: localtime
          hostPath:
            path: /etc/localtime                      # Synchronize the container time with the host.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 16Gi
        restartPolicy: OnFailure
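
To use the manifest, save the ConfigMap and the Job above into a single file and submit it with kubectl; the ConfigMap must be created together with (or before) the job. The following is a minimal sketch of the workflow; the file name vcjob.yaml is an assumption, and the pod name follows Volcano's <job>-<task>-<index> naming pattern:

kubectl create namespace vcjob                                       # Once, if the namespace does not exist yet.
kubectl apply -f vcjob.yaml                                          # Create the ConfigMap and the Volcano job.
kubectl get pods -n vcjob -o wide                                    # The pod should appear as mindx-dls-test-default-test-0.
kubectl get configmap rings-config-mindx-dls-test -n vcjob -o yaml   # Verify that hccl.json has been filled with the ranktable.
kubectl logs -f -n vcjob mindx-dls-test-default-test-0               # Follow the training output.
kubectl exec -it -n vcjob mindx-dls-test-default-test-0 -- bash      # Enter the container for debugging.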

 
