/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"os"
"path/filepath"
"github.com/spf13/cobra"
)
type commonFlags struct {
Master string
Kubeconfig string
}
func initFlags(cmd *cobra.Command, cf *commonFlags) {
cmd.Flags().StringVarP(&cf.Master, "master", "s", "", "the address of apiserver")
kubeConfFile := os.Getenv("KUBECONFIG")
if kubeConfFile == "" {
if home := homeDir(); home != "" {
kubeConfFile = filepath.Join(home, ".kube", "config")
}
}
cmd.Flags().StringVarP(&cf.Kubeconfig, "kubeconfig", "k", kubeConfFile, "(optional) absolute path to the kubeconfig file")
}
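// Default-resolution sketch (illustrative values only): the kubeconfig default
// is taken from the KUBECONFIG environment variable if set, otherwise from
// $HOME/.kube/config, and either can still be overridden with --kubeconfig/-k:
//
//	os.Setenv("KUBECONFIG", "/tmp/admin.conf") // hypothetical path
//	initFlags(cmd, &cf)                        // the -k flag now defaults to /tmp/admin.conf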
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"context"
"fmt"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"volcano.sh/apis/pkg/client/clientset/versioned"
"volcano.sh/volcano/pkg/cli/util"
)
type deleteFlags struct {
commonFlags
Namespace string
JobName string
}
var deleteJobFlags = &deleteFlags{}
// InitDeleteFlags initializes the delete command flags.
func InitDeleteFlags(cmd *cobra.Command) {
initFlags(cmd, &deleteJobFlags.commonFlags)
cmd.Flags().StringVarP(&deleteJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
cmd.Flags().StringVarP(&deleteJobFlags.JobName, "name", "N", "", "the name of job")
}
// DeleteJob deletes the specified job.
func DeleteJob() error {
config, err := util.BuildConfig(deleteJobFlags.Master, deleteJobFlags.Kubeconfig)
if err != nil {
return err
}
if deleteJobFlags.JobName == "" {
err := fmt.Errorf("job name is mandatory to delete a particular job")
return err
}
jobClient := versioned.NewForConfigOrDie(config)
err = jobClient.BatchV1alpha1().Jobs(deleteJobFlags.Namespace).Delete(context.TODO(), deleteJobFlags.JobName, metav1.DeleteOptions{})
if err != nil {
return err
}
fmt.Printf("delete job %v successfully\n", deleteJobFlags.JobName)
return nil
}
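// A minimal wiring sketch, assuming a hypothetical "delete" cobra command in a
// CLI main package; the command name and error handling are illustrative only:
//
//	deleteCmd := &cobra.Command{
//		Use: "delete",
//		RunE: func(cmd *cobra.Command, args []string) error {
//			return DeleteJob()
//		},
//	}
//	InitDeleteFlags(deleteCmd)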
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"context"
"fmt"
"io"
"os"
"strings"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/apis/pkg/client/clientset/versioned"
"volcano.sh/volcano/pkg/cli/util"
)
type listFlags struct {
commonFlags
Namespace string
SchedulerName string
allNamespace bool
selector string
}
const (
// The following keywords are used as column headers in the job print format.

// Name is the job name column.
Name string = "Name"
// Creation is the creation date column.
Creation string = "Creation"
// Phase is the job phase column.
Phase string = "Phase"
// Replicas is the total replicas column.
Replicas string = "Replicas"
// Min is the minimal available tasks column.
Min string = "Min"
// Scheduler is the scheduler name column.
Scheduler string = "Scheduler"
// Pending is the pending pods column.
Pending string = "Pending"
// Running is the running pods column.
Running string = "Running"
// Succeeded is the succeeded pods column.
Succeeded string = "Succeeded"
// Terminating is the terminating pods column.
Terminating string = "Terminating"
// Version is the job version column.
Version string = "Version"
// Failed is the failed pods column.
Failed string = "Failed"
// Unknown is the unknown pods column.
Unknown string = "Unknown"
// RetryCount is the retry count column.
RetryCount string = "RetryCount"
// JobType is the job type column.
JobType string = "JobType"
// Namespace is the job namespace column.
Namespace string = "Namespace"
)
var listJobFlags = &listFlags{}
// InitListFlags initializes the list command flags.
func InitListFlags(cmd *cobra.Command) {
initFlags(cmd, &listJobFlags.commonFlags)
cmd.Flags().StringVarP(&listJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
cmd.Flags().StringVarP(&listJobFlags.SchedulerName, "scheduler", "S", "", "list job with specified scheduler name")
cmd.Flags().BoolVarP(&listJobFlags.allNamespace, "all-namespaces", "", false, "list jobs in all namespaces")
cmd.Flags().StringVarP(&listJobFlags.selector, "selector", "", "", "fuzzy match jobs by name")
}
// ListJobs lists the details of all jobs.
func ListJobs() error {
config, err := util.BuildConfig(listJobFlags.Master, listJobFlags.Kubeconfig)
if err != nil {
return err
}
if listJobFlags.allNamespace {
listJobFlags.Namespace = ""
}
jobClient := versioned.NewForConfigOrDie(config)
jobs, err := jobClient.BatchV1alpha1().Jobs(listJobFlags.Namespace).List(context.TODO(), metav1.ListOptions{})
if err != nil {
return err
}
if len(jobs.Items) == 0 {
fmt.Printf("No resources found\n")
return nil
}
PrintJobs(jobs, os.Stdout)
return nil
}
// PrintJobs prints the details of all jobs to the given writer.
func PrintJobs(jobs *v1alpha1.JobList, writer io.Writer) {
maxLenInfo := getMaxLen(jobs)
titleFormat := "%%-%ds%%-15s%%-12s%%-12s%%-12s%%-6s%%-10s%%-10s%%-12s%%-10s%%-12s%%-10s\n"
contentFormat := "%%-%ds%%-15s%%-12s%%-12s%%-12d%%-6d%%-10d%%-10d%%-12d%%-10d%%-12d%%-10d\n"
var err error
if listJobFlags.allNamespace {
_, err = fmt.Fprintf(writer, fmt.Sprintf("%%-%ds"+titleFormat, maxLenInfo[1], maxLenInfo[0]),
Namespace, Name, Creation, Phase, JobType, Replicas, Min, Pending, Running, Succeeded, Failed, Unknown, RetryCount)
} else {
_, err = fmt.Fprintf(writer, fmt.Sprintf(titleFormat, maxLenInfo[0]),
Name, Creation, Phase, JobType, Replicas, Min, Pending, Running, Succeeded, Failed, Unknown, RetryCount)
}
if err != nil {
fmt.Printf("Failed to print list command result: %s.\n", err)
}
for _, job := range jobs.Items {
if listJobFlags.SchedulerName != "" && listJobFlags.SchedulerName != job.Spec.SchedulerName {
continue
}
if !strings.Contains(job.Name, listJobFlags.selector) {
continue
}
replicas := int32(0)
for _, ts := range job.Spec.Tasks {
replicas += ts.Replicas
}
jobType := job.ObjectMeta.Labels[v1alpha1.JobTypeKey]
if jobType == "" {
jobType = "Batch"
}
if listJobFlags.allNamespace {
_, err = fmt.Fprintf(writer, fmt.Sprintf("%%-%ds"+contentFormat, maxLenInfo[1], maxLenInfo[0]),
job.Namespace, job.Name, job.CreationTimestamp.Format("2006-01-02"), job.Status.State.Phase, jobType, replicas,
job.Status.MinAvailable, job.Status.Pending, job.Status.Running, job.Status.Succeeded, job.Status.Failed, job.Status.Unknown, job.Status.RetryCount)
} else {
_, err = fmt.Fprintf(writer, fmt.Sprintf(contentFormat, maxLenInfo[0]),
job.Name, job.CreationTimestamp.Format("2006-01-02"), job.Status.State.Phase, jobType, replicas,
job.Status.MinAvailable, job.Status.Pending, job.Status.Running, job.Status.Succeeded, job.Status.Failed, job.Status.Unknown, job.Status.RetryCount)
}
if err != nil {
fmt.Printf("Failed to print list command result: %s.\n", err)
}
}
}
func getMaxLen(jobs *v1alpha1.JobList) []int {
maxNameLen := len(Name)
maxNamespaceLen := len(Namespace)
for _, job := range jobs.Items {
if len(job.Name) > maxNameLen {
maxNameLen = len(job.Name)
}
if len(job.Namespace) > maxNamespaceLen {
maxNamespaceLen = len(job.Namespace)
}
}
return []int{maxNameLen + 3, maxNamespaceLen + 3}
}
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
"github.com/spf13/cobra"
"volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/volcano/pkg/cli/util"
)
type resumeFlags struct {
commonFlags
Namespace string
JobName string
}
var resumeJobFlags = &resumeFlags{}
// InitResumeFlags initializes the resume command flags.
func InitResumeFlags(cmd *cobra.Command) {
initFlags(cmd, &resumeJobFlags.commonFlags)
cmd.Flags().StringVarP(&resumeJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
cmd.Flags().StringVarP(&resumeJobFlags.JobName, "name", "N", "", "the name of job")
}
// ResumeJob resumes the job.
func ResumeJob() error {
config, err := util.BuildConfig(resumeJobFlags.Master, resumeJobFlags.Kubeconfig)
if err != nil {
return err
}
if resumeJobFlags.JobName == "" {
err := fmt.Errorf("job name is mandatory to resume a particular job")
return err
}
return createJobCommand(config,
resumeJobFlags.Namespace, resumeJobFlags.JobName,
v1alpha1.ResumeJobAction)
}
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"context"
"fmt"
"io/ioutil"
"strings"
"github.com/spf13/cobra"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/yaml"
vcbatch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/apis/pkg/client/clientset/versioned"
"volcano.sh/volcano/pkg/cli/util"
)
type runFlags struct {
commonFlags
Name string
Namespace string
Image string
MinAvailable int
Replicas int
Requests string
Limits string
SchedulerName string
FileName string
}
var launchJobFlags = &runFlags{}
// InitRunFlags initializes the run command flags.
func InitRunFlags(cmd *cobra.Command) {
initFlags(cmd, &launchJobFlags.commonFlags)
cmd.Flags().StringVarP(&launchJobFlags.Image, "image", "i", "busybox", "the container image of job")
cmd.Flags().StringVarP(&launchJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
cmd.Flags().StringVarP(&launchJobFlags.Name, "name", "N", "", "the name of job")
cmd.Flags().IntVarP(&launchJobFlags.MinAvailable, "min", "m", 1, "the minimal available tasks of job")
cmd.Flags().IntVarP(&launchJobFlags.Replicas, "replicas", "r", 1, "the total tasks of job")
cmd.Flags().StringVarP(&launchJobFlags.Requests, "requests", "R", "cpu=1000m,memory=100Mi", "the resource request of the task")
cmd.Flags().StringVarP(&launchJobFlags.Limits, "limits", "L", "cpu=1000m,memory=100Mi", "the resource limit of the task")
cmd.Flags().StringVarP(&launchJobFlags.SchedulerName, "scheduler", "S", "volcano", "the scheduler for this job")
cmd.Flags().StringVarP(&launchJobFlags.FileName, "filename", "f", "", "the yaml file of job")
}
var jobName = "job.volcano.sh"
// RunJob creates the job.
func RunJob() error {
config, err := util.BuildConfig(launchJobFlags.Master, launchJobFlags.Kubeconfig)
if err != nil {
return err
}
if launchJobFlags.Name == "" && launchJobFlags.FileName == "" {
err = fmt.Errorf("job name cannot be left blank")
return err
}
req, err := populateResourceListV1(launchJobFlags.Requests)
if err != nil {
return err
}
limit, err := populateResourceListV1(launchJobFlags.Limits)
if err != nil {
return err
}
job, err := readFile(launchJobFlags.FileName)
if err != nil {
return err
}
if job == nil {
job = constructLaunchJobFlagsJob(launchJobFlags, req, limit)
}
jobClient := versioned.NewForConfigOrDie(config)
newJob, err := jobClient.BatchV1alpha1().Jobs(launchJobFlags.Namespace).Create(context.TODO(), job, metav1.CreateOptions{})
if err != nil {
return err
}
if newJob.Spec.Queue == "" {
newJob.Spec.Queue = "default"
}
fmt.Printf("run job %v successfully\n", newJob.Name)
return nil
}
func readFile(filename string) (*vcbatch.Job, error) {
if filename == "" {
return nil, nil
}
if !strings.Contains(filename, ".yaml") && !strings.Contains(filename, ".yml") {
return nil, fmt.Errorf("only support yaml file")
}
file, err := ioutil.ReadFile(filename)
if err != nil {
return nil, fmt.Errorf("failed to read file, err: %v", err)
}
var job vcbatch.Job
if err := yaml.Unmarshal(file, &job); err != nil {
return nil, fmt.Errorf("failed to unmarshal file, err: %v", err)
}
return &job, nil
}
func constructLaunchJobFlagsJob(launchJobFlags *runFlags, req, limit v1.ResourceList) *vcbatch.Job {
return &vcbatch.Job{
ObjectMeta: metav1.ObjectMeta{
Name: launchJobFlags.Name,
Namespace: launchJobFlags.Namespace,
},
Spec: vcbatch.JobSpec{
MinAvailable: int32(launchJobFlags.MinAvailable),
SchedulerName: launchJobFlags.SchedulerName,
Tasks: []vcbatch.TaskSpec{
{
Replicas: int32(launchJobFlags.Replicas),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Name: launchJobFlags.Name,
Labels: map[string]string{jobName: launchJobFlags.Name},
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Image: launchJobFlags.Image,
Name: launchJobFlags.Name,
ImagePullPolicy: v1.PullIfNotPresent,
Resources: v1.ResourceRequirements{
Limits: limit,
Requests: req,
},
},
},
},
},
},
},
},
}
}
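// A sketch of the Job produced from the default run flags (field values are
// illustrative; the real defaults come from InitRunFlags):
//
//	req, _ := populateResourceListV1("cpu=1000m,memory=100Mi")
//	job := constructLaunchJobFlagsJob(&runFlags{
//		Name:          "test-job",
//		Namespace:     "default",
//		Image:         "busybox",
//		MinAvailable:  1,
//		Replicas:      1,
//		SchedulerName: "volcano",
//	}, req, req)
//	// job.Spec has one TaskSpec with Replicas=1 and a single busybox container,
//	// and the pod template is labelled "job.volcano.sh": "test-job".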
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
"github.com/spf13/cobra"
"volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/volcano/pkg/cli/util"
)
type suspendFlags struct {
commonFlags
Namespace string
JobName string
}
var suspendJobFlags = &suspendFlags{}
// InitSuspendFlags initializes the suspend command flags.
func InitSuspendFlags(cmd *cobra.Command) {
initFlags(cmd, &suspendJobFlags.commonFlags)
cmd.Flags().StringVarP(&suspendJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
cmd.Flags().StringVarP(&suspendJobFlags.JobName, "name", "N", "", "the name of job")
}
// SuspendJob suspends the job.
func SuspendJob() error {
config, err := util.BuildConfig(suspendJobFlags.Master, suspendJobFlags.Kubeconfig)
if err != nil {
return err
}
if suspendJobFlags.JobName == "" {
err := fmt.Errorf("job name is mandatory to suspend a particular job")
return err
}
return createJobCommand(config,
suspendJobFlags.Namespace, suspendJobFlags.JobName,
v1alpha1.AbortJobAction)
}
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"context"
"fmt"
"os"
"strings"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/client-go/rest"
vcbus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/apis/pkg/apis/helpers"
"volcano.sh/apis/pkg/client/clientset/versioned"
)
func homeDir() string {
if h := os.Getenv("HOME"); h != "" {
return h
}
return os.Getenv("USERPROFILE") // windows
}
// populateResourceListV1 takes a string of the form <resourceName1>=<value1>,<resourceName2>=<value2>
// and returns a ResourceList.
func populateResourceListV1(spec string) (v1.ResourceList, error) {
// An empty input returns nil to preserve the expected behavior of the generator tests.
if spec == "" {
return nil, nil
}
result := v1.ResourceList{}
resourceStatements := strings.Split(spec, ",")
for _, resourceStatement := range resourceStatements {
parts := strings.Split(resourceStatement, "=")
if len(parts) != 2 {
return nil, fmt.Errorf("invalid argument syntax %v, expected <resource>=<value>", resourceStatement)
}
resourceName := v1.ResourceName(parts[0])
resourceQuantity, err := resource.ParseQuantity(parts[1])
if err != nil {
return nil, err
}
result[resourceName] = resourceQuantity
}
return result, nil
}
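// Parsing sketch: the default request string used by the run command expands
// into a two-entry ResourceList, for example
//
//	rl, err := populateResourceListV1("cpu=1000m,memory=100Mi")
//	// rl[v1.ResourceCPU]    == resource.MustParse("1000m")
//	// rl[v1.ResourceMemory] == resource.MustParse("100Mi")
//	// a malformed item such as "cpu" (no "=") returns a syntax error instead.
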
func createJobCommand(config *rest.Config, ns, name string, action vcbus.Action) error {
jobClient := versioned.NewForConfigOrDie(config)
job, err := jobClient.BatchV1alpha1().Jobs(ns).Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
ctrlRef := metav1.NewControllerRef(job, helpers.JobKind)
cmd := &vcbus.Command{
ObjectMeta: metav1.ObjectMeta{
GenerateName: fmt.Sprintf("%s-%s-",
job.Name, strings.ToLower(string(action))),
Namespace: job.Namespace,
OwnerReferences: []metav1.OwnerReference{
*ctrlRef,
},
},
TargetObject: ctrlRef,
Action: string(action),
}
if _, err := jobClient.BusV1alpha1().Commands(ns).Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
return err
}
return nil
}
func translateTimestampSince(timestamp metav1.Time) string {
if timestamp.IsZero() {
return "<unknown>"
}
return HumanDuration(time.Since(timestamp.Time))
}
// HumanDuration translates a time.Duration into a human-readable string.
func HumanDuration(d time.Duration) string {
// Allow a deviation of less than 2 seconds to tolerate machine clock
// inconsistency; such durations are treated as "just now".
if seconds := int(d.Seconds()); seconds < -1 {
return "<invalid>"
} else if seconds < 0 {
return "0s"
} else if seconds < 60*2 {
return fmt.Sprintf("%ds", seconds)
}
minutes := int(d / time.Minute)
if minutes < 10 {
s := int(d/time.Second) % 60
if s == 0 {
return fmt.Sprintf("%dm", minutes)
}
return fmt.Sprintf("%dm%ds", minutes, s)
} else if minutes < 60*3 {
return fmt.Sprintf("%dm", minutes)
}
hours := int(d / time.Hour)
if hours < 8 {
m := int(d/time.Minute) % 60
if m == 0 {
return fmt.Sprintf("%dh", hours)
}
return fmt.Sprintf("%dh%dm", hours, m)
} else if hours < 48 {
return fmt.Sprintf("%dh", hours)
} else if hours < 24*8 {
h := hours % 24
if h == 0 {
return fmt.Sprintf("%dd", hours/24)
}
return fmt.Sprintf("%dd%dh", hours/24, h)
} else if hours < 24*365*2 {
return fmt.Sprintf("%dd", hours/24)
} else if hours < 24*365*8 {
return fmt.Sprintf("%dy%dd", hours/24/365, (hours/24)%365)
}
return fmt.Sprintf("%dy", hours/24/365)
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"context"
"encoding/json"
"fmt"
"io"
"os"
"strings"
"github.com/spf13/cobra"
coreV1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/apis/pkg/client/clientset/versioned"
"volcano.sh/volcano/pkg/cli/util"
)
type viewFlags struct {
commonFlags
Namespace string
JobName string
}
// Levels of print indentation.
const (
Level0 = iota
Level1
Level2
)
var viewJobFlags = &viewFlags{}
// InitViewFlags initializes the view command flags.
func InitViewFlags(cmd *cobra.Command) {
initFlags(cmd, &viewJobFlags.commonFlags)
cmd.Flags().StringVarP(&viewJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
cmd.Flags().StringVarP(&viewJobFlags.JobName, "name", "N", "", "the name of job")
}
// ViewJob gives full details of the job.
func ViewJob() error {
config, err := util.BuildConfig(viewJobFlags.Master, viewJobFlags.Kubeconfig)
if err != nil {
return err
}
if viewJobFlags.JobName == "" {
err := fmt.Errorf("job name (specified by --name or -N) is mandatory to view a particular job")
return err
}
jobClient := versioned.NewForConfigOrDie(config)
job, err := jobClient.BatchV1alpha1().Jobs(viewJobFlags.Namespace).Get(context.TODO(), viewJobFlags.JobName, metav1.GetOptions{})
if err != nil {
return err
}
if job == nil {
fmt.Printf("No resources found\n")
return nil
}
PrintJobInfo(job, os.Stdout)
PrintEvents(GetEvents(config, job), os.Stdout)
return nil
}
// PrintJobInfo prints detailed job information to the given writer.
func PrintJobInfo(job *v1alpha1.Job, writer io.Writer) {
WriteLine(writer, Level0, "Name: \t%s\n", job.Name)
WriteLine(writer, Level0, "Namespace: \t%s\n", job.Namespace)
if len(job.Labels) > 0 {
label, _ := json.Marshal(job.Labels)
WriteLine(writer, Level0, "Labels: \t%s\n", string(label))
} else {
WriteLine(writer, Level0, "Labels: \t<none>\n")
}
if len(job.Annotations) > 0 {
annotation, _ := json.Marshal(job.Annotations)
WriteLine(writer, Level0, "Annotations:\t%s\n", string(annotation))
} else {
WriteLine(writer, Level0, "Annotations:\t<none>\n")
}
WriteLine(writer, Level0, "API Version:\t%s\n", job.APIVersion)
WriteLine(writer, Level0, "Kind: \t%s\n", job.Kind)
WriteLine(writer, Level0, "Metadata:\n")
WriteLine(writer, Level1, "Creation Timestamp:\t%s\n", job.CreationTimestamp)
WriteLine(writer, Level1, "Generate Name: \t%s\n", job.GenerateName)
WriteLine(writer, Level1, "Generation: \t%d\n", job.Generation)
WriteLine(writer, Level1, "Resource Version: \t%s\n", job.ResourceVersion)
WriteLine(writer, Level1, "Self Link: \t%s\n", job.SelfLink)
WriteLine(writer, Level1, "UID: \t%s\n", job.UID)
WriteLine(writer, Level0, "Spec:\n")
WriteLine(writer, Level1, "Min Available: \t%d\n", job.Spec.MinAvailable)
WriteLine(writer, Level1, "Plugins:\n")
WriteLine(writer, Level2, "Env:\t%v\n", job.Spec.Plugins["env"])
WriteLine(writer, Level2, "Ssh:\t%v\n", job.Spec.Plugins["ssh"])
WriteLine(writer, Level1, "Scheduler Name: \t%s\n", job.Spec.SchedulerName)
WriteLine(writer, Level1, "Tasks:\n")
for i := 0; i < len(job.Spec.Tasks); i++ {
WriteLine(writer, Level2, "Name:\t%s\n", job.Spec.Tasks[i].Name)
WriteLine(writer, Level2, "Replicas:\t%d\n", job.Spec.Tasks[i].Replicas)
WriteLine(writer, Level2, "Template:\n")
WriteLine(writer, Level2+1, "Metadata:\n")
WriteLine(writer, Level2+2, "Annotations:\n")
WriteLine(writer, Level2+3, "Cri . Cci . Io / Container - Type: \t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.Annotations["cri.cci.io/container-type"])
WriteLine(writer, Level2+3, "Kubernetes . Io / Availablezone: \t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.Annotations["kubernetes.io/availablezone"])
WriteLine(writer, Level2+3, "Network . Alpha . Kubernetes . Io / Network:\t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.Annotations["network.alpha.kubernetes.io/network"])
WriteLine(writer, Level2+2, "Creation Timestamp:\t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.CreationTimestamp)
WriteLine(writer, Level2+1, "Spec:\n")
WriteLine(writer, Level2+2, "Containers:\n")
for j := 0; j < len(job.Spec.Tasks[i].Template.Spec.Containers); j++ {
WriteLine(writer, Level2+3, "Command:\n")
for k := 0; k < len(job.Spec.Tasks[i].Template.Spec.Containers[j].Command); k++ {
WriteLine(writer, Level2+4, "%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Command[k])
}
WriteLine(writer, Level2+3, "Image:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Image)
WriteLine(writer, Level2+3, "Name: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Name)
WriteLine(writer, Level2+3, "Ports:\n")
for k := 0; k < len(job.Spec.Tasks[i].Template.Spec.Containers[j].Ports); k++ {
WriteLine(writer, Level2+4, "Container Port:\t%d\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Ports[k].ContainerPort)
WriteLine(writer, Level2+4, "Name: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Ports[k].Name)
}
WriteLine(writer, Level2+3, "Resources:\n")
WriteLine(writer, Level2+4, "Limits:\n")
WriteLine(writer, Level2+5, "Cpu: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Limits.Cpu())
WriteLine(writer, Level2+5, "Memory:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Limits.Memory())
WriteLine(writer, Level2+4, "Requests:\n")
WriteLine(writer, Level2+5, "Cpu: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Requests.Cpu())
WriteLine(writer, Level2+5, "Memory:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Requests.Memory())
WriteLine(writer, Level2+4, "Working Dir:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].WorkingDir)
}
WriteLine(writer, Level2+2, "Image Pull Secrets:\n")
for j := 0; j < len(job.Spec.Tasks[i].Template.Spec.ImagePullSecrets); j++ {
WriteLine(writer, Level2+3, "Name: \t%s\n", job.Spec.Tasks[i].Template.Spec.ImagePullSecrets[j].Name)
}
WriteLine(writer, Level2+2, "Restart Policy: \t%s\n", job.Spec.Tasks[i].Template.Spec.RestartPolicy)
}
WriteLine(writer, Level0, "Status:\n")
if job.Status.Succeeded > 0 {
WriteLine(writer, Level1, "Succeeded: \t%d\n", job.Status.Succeeded)
}
if job.Status.Pending > 0 {
WriteLine(writer, Level1, "Pending: \t%d\n", job.Status.Pending)
}
if job.Status.Running > 0 {
WriteLine(writer, Level1, "Running: \t%d\n", job.Status.Running)
}
if job.Status.Failed > 0 {
WriteLine(writer, Level1, "Failed: \t%d\n", job.Status.Failed)
}
if job.Status.Terminating > 0 {
WriteLine(writer, Level1, "Terminating: \t%d\n", job.Status.Terminating)
}
if job.Status.Unknown > 0 {
WriteLine(writer, Level1, "Unknown: \t%d\n", job.Status.Unknown)
}
if job.Status.RetryCount > 0 {
WriteLine(writer, Level1, "RetryCount: \t%d\n", job.Status.RetryCount)
}
if job.Status.MinAvailable > 0 {
WriteLine(writer, Level1, "Min Available:\t%d\n", job.Status.MinAvailable)
}
if job.Status.Version > 0 {
WriteLine(writer, Level1, "Version: \t%d\n", job.Status.Version)
}
WriteLine(writer, Level1, "State:\n")
WriteLine(writer, Level2, "Phase:\t%s\n", job.Status.State.Phase)
if len(job.Status.ControlledResources) > 0 {
WriteLine(writer, Level1, "Controlled Resources:\n")
for key, value := range job.Status.ControlledResources {
WriteLine(writer, Level2, "%s: \t%s\n", key, value)
}
}
if len(job.Status.Conditions) > 0 {
WriteLine(writer, Level1, "Conditions:\n Status\tTransitionTime\n")
for _, c := range job.Status.Conditions {
WriteLine(writer, Level2, "%v \t%v \n",
c.Status,
c.LastTransitionTime)
}
}
}
// PrintEvents prints event information to the given writer.
func PrintEvents(events []coreV1.Event, writer io.Writer) {
if len(events) > 0 {
WriteLine(writer, Level0, "%s:\n%-15s\t%-40s\t%-30s\t%-40s\t%s\n", "Events", "Type", "Reason", "Age", "From", "Message")
WriteLine(writer, Level0, "%-15s\t%-40s\t%-30s\t%-40s\t%s\n", "-------", "-------", "-------", "-------", "-------")
for _, e := range events {
var interval string
if e.Count > 1 {
interval = fmt.Sprintf("%s (x%d over %s)", translateTimestampSince(e.LastTimestamp), e.Count, translateTimestampSince(e.FirstTimestamp))
} else {
interval = translateTimestampSince(e.FirstTimestamp)
}
EventSourceString := []string{e.Source.Component}
if len(e.Source.Host) > 0 {
EventSourceString = append(EventSourceString, e.Source.Host)
}
WriteLine(writer, Level0, "%-15v\t%-40v\t%-30s\t%-40s\t%v\n",
e.Type,
e.Reason,
interval,
strings.Join(EventSourceString, ", "),
strings.TrimSpace(e.Message),
)
}
} else {
WriteLine(writer, Level0, "Events: \t<none>\n")
}
}
// GetEvents gets the events related to the job using the given config.
func GetEvents(config *rest.Config, job *v1alpha1.Job) []coreV1.Event {
kubeClient, err := kubernetes.NewForConfig(config)
if err != nil {
fmt.Printf("%v\n", err)
return nil
}
events, _ := kubeClient.CoreV1().Events(viewJobFlags.Namespace).List(context.TODO(), metav1.ListOptions{})
var jobEvents []coreV1.Event
for _, v := range events.Items {
if strings.HasPrefix(v.ObjectMeta.Name, job.Name+".") {
jobEvents = append(jobEvents, v)
}
}
return jobEvents
}
// WriteLine writes a line with the given indentation level.
func WriteLine(writer io.Writer, spaces int, content string, params ...interface{}) {
prefix := ""
for i := 0; i < spaces; i++ {
prefix += " "
}
fmt.Fprintf(writer, prefix+content, params...)
}
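// Indentation sketch: each level adds two spaces of prefix, so with an
// illustrative writer w:
//
//	WriteLine(w, Level0, "Spec:\n")                 // "Spec:"
//	WriteLine(w, Level1, "Min Available:\t%d\n", 1) // "  Min Available:\t1"
//	WriteLine(w, Level2, "Phase:\t%s\n", "Running") // "    Phase:\tRunning"
//
// which is how PrintJobInfo builds its describe-style output.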
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"os"
"path/filepath"
"github.com/spf13/cobra"
)
type commonFlags struct {
Master string
Kubeconfig string
SchedulerName string
}
func initFlags(cmd *cobra.Command, cf *commonFlags) {
cmd.Flags().StringVarP(&cf.SchedulerName, "scheduler", "", "volcano", "the scheduler for this job")
cmd.Flags().StringVarP(&cf.Master, "master", "s", "", "the address of apiserver")
kubeConfFile := os.Getenv("KUBECONFIG")
if kubeConfFile == "" {
if home := homeDir(); home != "" {
kubeConfFile = filepath.Join(home, ".kube", "config")
}
}
cmd.Flags().StringVarP(&cf.Kubeconfig, "kubeconfig", "k", kubeConfFile, "(optional) absolute path to the kubeconfig file")
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"context"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/apis/pkg/client/clientset/versioned"
)
type createFlags struct {
commonFlags
Name string
Weight int32
// State is the state of the queue.
State string
}
var createQueueFlags = &createFlags{}
// InitCreateFlags initializes all flags for queue creation.
func InitCreateFlags(cmd *cobra.Command) {
initFlags(cmd, &createQueueFlags.commonFlags)
cmd.Flags().StringVarP(&createQueueFlags.Name, "name", "n", "test", "the name of queue")
cmd.Flags().Int32VarP(&createQueueFlags.Weight, "weight", "w", 1, "the weight of the queue")
cmd.Flags().StringVarP(&createQueueFlags.State, "state", "S", "Open", "the state of queue")
}
// CreateQueue creates a queue.
func CreateQueue() error {
config, err := buildConfig(createQueueFlags.Master, createQueueFlags.Kubeconfig)
if err != nil {
return err
}
queue := &schedulingv1beta1.Queue{
ObjectMeta: metav1.ObjectMeta{
Name: createQueueFlags.Name,
},
Spec: schedulingv1beta1.QueueSpec{
Weight: createQueueFlags.Weight,
},
Status: schedulingv1beta1.QueueStatus{
State: schedulingv1beta1.QueueState(createQueueFlags.State),
},
}
queueClient := versioned.NewForConfigOrDie(config)
if _, err := queueClient.SchedulingV1beta1().Queues().Create(context.TODO(), queue, metav1.CreateOptions{}); err != nil {
return err
}
return nil
}
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"context"
"fmt"
"volcano.sh/apis/pkg/client/clientset/versioned"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
type deleteFlags struct {
commonFlags
// Name is the name of the queue.
Name string
}
var deleteQueueFlags = &deleteFlags{}
// InitDeleteFlags initializes all flags for queue deletion.
func InitDeleteFlags(cmd *cobra.Command) {
initFlags(cmd, &deleteQueueFlags.commonFlags)
cmd.Flags().StringVarP(&deleteQueueFlags.Name, "name", "n", "", "the name of queue")
}
// DeleteQueue deletes a queue.
func DeleteQueue() error {
config, err := buildConfig(deleteQueueFlags.Master, deleteQueueFlags.Kubeconfig)
if err != nil {
return err
}
if len(deleteQueueFlags.Name) == 0 {
return fmt.Errorf("queue name must be specified")
}
queueClient := versioned.NewForConfigOrDie(config)
return queueClient.SchedulingV1beta1().Queues().Delete(context.TODO(), deleteQueueFlags.Name, metav1.DeleteOptions{})
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"context"
"fmt"
"io"
"os"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/apis/pkg/client/clientset/versioned"
)
type getFlags struct {
commonFlags
Name string
}
var getQueueFlags = &getFlags{}
// InitGetFlags initializes all flags for the get command.
func InitGetFlags(cmd *cobra.Command) {
initFlags(cmd, &getQueueFlags.commonFlags)
cmd.Flags().StringVarP(&getQueueFlags.Name, "name", "n", "", "the name of queue")
}
// GetQueue gets a queue.
func GetQueue() error {
config, err := buildConfig(getQueueFlags.Master, getQueueFlags.Kubeconfig)
if err != nil {
return err
}
if getQueueFlags.Name == "" {
err := fmt.Errorf("name is mandatory to get the particular queue details")
return err
}
queueClient := versioned.NewForConfigOrDie(config)
queue, err := queueClient.SchedulingV1beta1().Queues().Get(context.TODO(), getQueueFlags.Name, metav1.GetOptions{})
if err != nil {
return err
}
PrintQueue(queue, os.Stdout)
return nil
}
// PrintQueue prints queue information.
func PrintQueue(queue *v1beta1.Queue, writer io.Writer) {
_, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
Name, Weight, State, Inqueue, Pending, Running, Unknown)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
_, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
}
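// Output sketch for a single queue (column widths come from the format strings
// above; the values and alignment shown are illustrative):
//
//	Name                     Weight  State   Inqueue Pending Running Unknown
//	default                  1       Open    0       0       0       0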
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"context"
"fmt"
"io"
"os"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/apis/pkg/client/clientset/versioned"
)
type listFlags struct {
commonFlags
}
const (
// Weight is the weight of the queue.
Weight string = "Weight"
// Name is the name of the queue.
Name string = "Name"
// Pending is the pending status of the queue.
Pending string = "Pending"
// Running is the running status of the queue.
Running string = "Running"
// Unknown is the unknown status of the queue.
Unknown string = "Unknown"
// Inqueue is the inqueue status of the queue.
Inqueue string = "Inqueue"
// State is the state of the queue.
State string = "State"
)
var listQueueFlags = &listFlags{}
// InitListFlags initializes all flags for the list command.
func InitListFlags(cmd *cobra.Command) {
initFlags(cmd, &listQueueFlags.commonFlags)
}
// ListQueue lists all queues.
func ListQueue() error {
config, err := buildConfig(listQueueFlags.Master, listQueueFlags.Kubeconfig)
if err != nil {
return err
}
jobClient := versioned.NewForConfigOrDie(config)
queues, err := jobClient.SchedulingV1beta1().Queues().List(context.TODO(), metav1.ListOptions{})
if err != nil {
return err
}
if len(queues.Items) == 0 {
fmt.Printf("No resources found\n")
return nil
}
PrintQueues(queues, os.Stdout)
return nil
}
// PrintQueues prints information for all queues to the given writer.
func PrintQueues(queues *v1beta1.QueueList, writer io.Writer) {
_, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
Name, Weight, State, Inqueue, Pending, Running, Unknown)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
for _, queue := range queues.Items {
_, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
}
}
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"context"
"fmt"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/apis/pkg/client/clientset/versioned"
)
const (
// ActionOpen is `open` action
ActionOpen = "open"
// ActionClose is `close` action
ActionClose = "close"
// ActionUpdate is `update` action
ActionUpdate = "update"
)
type operateFlags struct {
commonFlags
// Name is the name of the queue.
Name string
// Weight is the weight of the queue.
Weight int32
// Action is the operation to perform on the queue.
Action string
}
var operateQueueFlags = &operateFlags{}
// InitOperateFlags initializes all flags for queue operations.
func InitOperateFlags(cmd *cobra.Command) {
initFlags(cmd, &operateQueueFlags.commonFlags)
cmd.Flags().StringVarP(&operateQueueFlags.Name, "name", "n", "", "the name of queue")
cmd.Flags().Int32VarP(&operateQueueFlags.Weight, "weight", "w", 0, "the weight of the queue")
cmd.Flags().StringVarP(&operateQueueFlags.Action, "action", "a", "",
"operate action to queue, valid actions are open, close, update")
}
// OperateQueue performs the requested action (open, close, or update) on a queue.
func OperateQueue() error {
config, err := buildConfig(operateQueueFlags.Master, operateQueueFlags.Kubeconfig)
if err != nil {
return err
}
if len(operateQueueFlags.Name) == 0 {
return fmt.Errorf("queue name must be specified")
}
var action v1alpha1.Action
switch operateQueueFlags.Action {
case ActionOpen:
action = v1alpha1.OpenQueueAction
case ActionClose:
action = v1alpha1.CloseQueueAction
case ActionUpdate:
if operateQueueFlags.Weight == 0 {
return fmt.Errorf("when %s queue %s, weight must be specified, "+
"the value must be greater than 0", ActionUpdate, operateQueueFlags.Name)
}
queueClient := versioned.NewForConfigOrDie(config)
patchBytes := []byte(fmt.Sprintf(`{"spec":{"weight":%d}}`, operateQueueFlags.Weight))
_, err := queueClient.SchedulingV1beta1().Queues().Patch(context.TODO(),
operateQueueFlags.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{})
return err
case "":
return fmt.Errorf("action can not be null")
default:
return fmt.Errorf("action %s invalid, valid actions are %s, %s and %s",
operateQueueFlags.Action, ActionOpen, ActionClose, ActionUpdate)
}
return createQueueCommand(config, action)
}
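// Update-action sketch (the command invocation shown is illustrative): updating
// the weight of queue "default" to 4 sends a JSON merge patch of
//
//	{"spec":{"weight":4}}
//
// directly to the Queue object, while the open and close actions instead create
// a bus Command via createQueueCommand for the controller to act on.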
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"context"
"fmt"
"os"
"strings"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
// Initialize client auth plugin.
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/apis/pkg/apis/helpers"
"volcano.sh/apis/pkg/client/clientset/versioned"
)
func homeDir() string {
if h := os.Getenv("HOME"); h != "" {
return h
}
return os.Getenv("USERPROFILE") // windows
}
func buildConfig(master, kubeconfig string) (*rest.Config, error) {
return clientcmd.BuildConfigFromFlags(master, kubeconfig)
}
func createQueueCommand(config *rest.Config, action busv1alpha1.Action) error {
queueClient := versioned.NewForConfigOrDie(config)
queue, err := queueClient.SchedulingV1beta1().Queues().Get(context.TODO(), operateQueueFlags.Name, metav1.GetOptions{})
if err != nil {
return err
}
ctrlRef := metav1.NewControllerRef(queue, helpers.V1beta1QueueKind)
cmd := &busv1alpha1.Command{
ObjectMeta: metav1.ObjectMeta{
GenerateName: fmt.Sprintf("%s-%s-",
queue.Name, strings.ToLower(string(action))),
OwnerReferences: []metav1.OwnerReference{
*ctrlRef,
},
},
TargetObject: ctrlRef,
Action: string(action),
}
if _, err := queueClient.BusV1alpha1().Commands("default").Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
return err
}
return nil
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
vcbus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/apis/pkg/apis/helpers"
"volcano.sh/apis/pkg/client/clientset/versioned"
)
// CommonFlags are the flags that most command lines have.
type CommonFlags struct {
Master string
Kubeconfig string
}
// InitFlags initializes the common flags for most command lines.
func InitFlags(cmd *cobra.Command, cf *CommonFlags) {
cmd.Flags().StringVarP(&cf.Master, "master", "s", "", "the address of apiserver")
kubeConfFile := os.Getenv("KUBECONFIG")
if kubeConfFile == "" {
if home := HomeDir(); home != "" {
kubeConfFile = filepath.Join(home, ".kube", "config")
}
}
cmd.Flags().StringVarP(&cf.Kubeconfig, "kubeconfig", "k", kubeConfFile, "(optional) absolute path to the kubeconfig file")
}
// HomeDir returns the user's home directory from $HOME (or USERPROFILE on Windows).
func HomeDir() string {
if h := os.Getenv("HOME"); h != "" {
return h
}
return os.Getenv("USERPROFILE") // windows
}
// BuildConfig builds a rest.Config from the given master URL and kubeconfig path.
func BuildConfig(master, kubeconfig string) (*rest.Config, error) {
return clientcmd.BuildConfigFromFlags(master, kubeconfig)
}
// PopulateResourceListV1 takes a string of the form <resourceName1>=<value1>,<resourceName2>=<value2> and returns a ResourceList.
func PopulateResourceListV1(spec string) (v1.ResourceList, error) {
// An empty input returns nil to preserve the expected behavior of the generator tests.
if spec == "" {
return nil, nil
}
result := v1.ResourceList{}
resourceStatements := strings.Split(spec, ",")
for _, resourceStatement := range resourceStatements {
parts := strings.Split(resourceStatement, "=")
if len(parts) != 2 {
return nil, fmt.Errorf("invalid argument syntax %v, expected <resource>=<value>", resourceStatement)
}
resourceName := v1.ResourceName(parts[0])
resourceQuantity, err := resource.ParseQuantity(parts[1])
if err != nil {
return nil, err
}
result[resourceName] = resourceQuantity
}
return result, nil
}
// CreateQueueCommand creates a bus Command for the queue to perform an action such as open/close.
func CreateQueueCommand(vcClient *versioned.Clientset, ns, name string, action vcbus.Action) error {
queue, err := vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
ctrlRef := metav1.NewControllerRef(queue, helpers.V1beta1QueueKind)
cmd := &vcbus.Command{
ObjectMeta: metav1.ObjectMeta{
GenerateName: fmt.Sprintf("%s-%s-",
queue.Name, strings.ToLower(string(action))),
Namespace: queue.Namespace,
OwnerReferences: []metav1.OwnerReference{
*ctrlRef,
},
},
TargetObject: ctrlRef,
Action: string(action),
}
if _, err := vcClient.BusV1alpha1().Commands(ns).Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
return err
}
return nil
}
// CreateJobCommand creates a bus Command for the job to perform an action such as resume/suspend.
func CreateJobCommand(config *rest.Config, ns, name string, action vcbus.Action) error {
jobClient := versioned.NewForConfigOrDie(config)
job, err := jobClient.BatchV1alpha1().Jobs(ns).Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
ctrlRef := metav1.NewControllerRef(job, helpers.JobKind)
cmd := &vcbus.Command{
ObjectMeta: metav1.ObjectMeta{
GenerateName: fmt.Sprintf("%s-%s-",
job.Name, strings.ToLower(string(action))),
Namespace: job.Namespace,
OwnerReferences: []metav1.OwnerReference{
*ctrlRef,
},
},
TargetObject: ctrlRef,
Action: string(action),
}
if _, err := jobClient.BusV1alpha1().Commands(ns).Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
return err
}
return nil
}
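// Command-object sketch: suspending a job named "demo" with an action whose
// lower-cased string form is, for example, "abortjob" (the exact constant value
// lives in the bus API) creates a Command in the job's namespace with
//
//	GenerateName:                    "demo-abortjob-"
//	OwnerReferences / TargetObject:  a controller reference to the Job
//	Action:                          the action's string form
//
// so the job controller can pick the Command up and perform the state change.
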
// TranslateTimestampSince returns the elapsed time since the given timestamp in human-readable form.
func TranslateTimestampSince(timestamp metav1.Time) string {
if timestamp.IsZero() {
return "<unknown>"
}
return HumanDuration(time.Since(timestamp.Time))
}
// HumanDuration translates a time.Duration into a human-readable string.
func HumanDuration(d time.Duration) string {
// Allow a deviation of less than 2 seconds to tolerate machine clock
// inconsistency; such durations are treated as "just now".
if seconds := int(d.Seconds()); seconds < -1 {
return "<invalid>"
} else if seconds < 0 {
return "0s"
} else if seconds < 60*2 {
return fmt.Sprintf("%ds", seconds)
}
minutes := int(d / time.Minute)
if minutes < 10 {
s := int(d/time.Second) % 60
if s == 0 {
return fmt.Sprintf("%dm", minutes)
}
return fmt.Sprintf("%dm%ds", minutes, s)
} else if minutes < 60*3 {
return fmt.Sprintf("%dm", minutes)
}
hours := int(d / time.Hour)
if hours < 8 {
m := int(d/time.Minute) % 60
if m == 0 {
return fmt.Sprintf("%dh", hours)
}
return fmt.Sprintf("%dh%dm", hours, m)
} else if hours < 48 {
return fmt.Sprintf("%dh", hours)
} else if hours < 24*8 {
h := hours % 24
if h == 0 {
return fmt.Sprintf("%dd", hours/24)
}
return fmt.Sprintf("%dd%dh", hours/24, h)
} else if hours < 24*365*2 {
return fmt.Sprintf("%dd", hours/24)
} else if hours < 24*365*8 {
return fmt.Sprintf("%dy%dd", hours/24/365, (hours/24)%365)
}
return fmt.Sprintf("%dy", hours/24/365)
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package vcancel
import (
"context"
"fmt"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"volcano.sh/apis/pkg/client/clientset/versioned"
"volcano.sh/volcano/pkg/cli/util"
)
type cancelFlags struct {
util.CommonFlags
Namespace string
JobName string
}
var cancelJobFlags = &cancelFlags{}
// InitCancelFlags initializes the cancel command flags.
func InitCancelFlags(cmd *cobra.Command) {
util.InitFlags(cmd, &cancelJobFlags.CommonFlags)
cmd.Flags().StringVarP(&cancelJobFlags.Namespace, "namespace", "N", "default", "the namespace of job")
cmd.Flags().StringVarP(&cancelJobFlags.JobName, "name", "n", "", "the name of job")
}
// CancelJob cancels the job.
func CancelJob() error {
config, err := util.BuildConfig(cancelJobFlags.Master, cancelJobFlags.Kubeconfig)
if err != nil {
return err
}
if cancelJobFlags.JobName == "" {
err := fmt.Errorf("job name is mandatory to cancel a particular job")
return err
}
jobClient := versioned.NewForConfigOrDie(config)
err = jobClient.BatchV1alpha1().Jobs(cancelJobFlags.Namespace).Delete(context.TODO(), cancelJobFlags.JobName, metav1.DeleteOptions{})
if err != nil {
return err
}
fmt.Printf("cancel job %v successfully\n", cancelJobFlags.JobName)
return nil
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package vresume
import (
"fmt"
"github.com/spf13/cobra"
"volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/volcano/pkg/cli/util"
)
type resumeFlags struct {
util.CommonFlags
Namespace string
JobName string
}
var resumeJobFlags = &resumeFlags{}
// InitResumeFlags initializes the resume command flags.
func InitResumeFlags(cmd *cobra.Command) {
util.InitFlags(cmd, &resumeJobFlags.CommonFlags)
cmd.Flags().StringVarP(&resumeJobFlags.Namespace, "namespace", "N", "default", "the namespace of job")
cmd.Flags().StringVarP(&resumeJobFlags.JobName, "name", "n", "", "the name of job")
}
// ResumeJob resumes the job.
func ResumeJob() error {
config, err := util.BuildConfig(resumeJobFlags.Master, resumeJobFlags.Kubeconfig)
if err != nil {
return err
}
if resumeJobFlags.JobName == "" {
err := fmt.Errorf("job name is mandatory to resume a particular job")
return err
}
return util.CreateJobCommand(config,
resumeJobFlags.Namespace, resumeJobFlags.JobName,
v1alpha1.ResumeJobAction)
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package vsuspend
import (
"fmt"
"github.com/spf13/cobra"
"volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/volcano/pkg/cli/util"
)
type suspendFlags struct {
util.CommonFlags
Namespace string
JobName string
}
var suspendJobFlags = &suspendFlags{}
// InitSuspendFlags initializes the suspend command flags.
func InitSuspendFlags(cmd *cobra.Command) {
util.InitFlags(cmd, &suspendJobFlags.CommonFlags)
cmd.Flags().StringVarP(&suspendJobFlags.Namespace, "namespace", "N", "default", "the namespace of job")
cmd.Flags().StringVarP(&suspendJobFlags.JobName, "name", "n", "", "the name of job")
}
// SuspendJob suspends the job.
func SuspendJob() error {
config, err := util.BuildConfig(suspendJobFlags.Master, suspendJobFlags.Kubeconfig)
if err != nil {
return err
}
if suspendJobFlags.JobName == "" {
err := fmt.Errorf("job name is mandatory to suspend a particular job")
return err
}
return util.CreateJobCommand(config,
suspendJobFlags.Namespace, suspendJobFlags.JobName,
v1alpha1.AbortJobAction)
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package apis
import (
"fmt"
v1 "k8s.io/api/core/v1"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
)
// JobInfo collects a Volcano job and its pods, indexed by task name and pod name.
type JobInfo struct {
Namespace string
Name string
Job *batch.Job
Pods map[string]map[string]*v1.Pod
}
// Clone returns a deep copy of the JobInfo, including its pod maps.
func (ji *JobInfo) Clone() *JobInfo {
job := &JobInfo{
Namespace: ji.Namespace,
Name: ji.Name,
Job: ji.Job,
Pods: make(map[string]map[string]*v1.Pod),
}
for key, pods := range ji.Pods {
job.Pods[key] = make(map[string]*v1.Pod)
for pn, pod := range pods {
job.Pods[key][pn] = pod
}
}
return job
}
// SetJob sets the Volcano job on the JobInfo.
func (ji *JobInfo) SetJob(job *batch.Job) {
ji.Name = job.Name
ji.Namespace = job.Namespace
ji.Job = job
}
// AddPod adds the pod to the Pods map of the JobInfo if it is not already present;
// otherwise it returns an error.
func (ji *JobInfo) AddPod(pod *v1.Pod) error {
taskName, found := pod.Annotations[batch.TaskSpecKey]
if !found {
return fmt.Errorf("failed to find taskName of Pod <%s/%s>",
pod.Namespace, pod.Name)
}
_, found = pod.Annotations[batch.JobVersion]
if !found {
return fmt.Errorf("failed to find jobVersion of Pod <%s/%s>",
pod.Namespace, pod.Name)
}
if _, found := ji.Pods[taskName]; !found {
ji.Pods[taskName] = make(map[string]*v1.Pod)
}
if _, found := ji.Pods[taskName][pod.Name]; found {
return fmt.Errorf("duplicated pod")
}
ji.Pods[taskName][pod.Name] = pod
return nil
}
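// A minimal usage sketch: AddPod only accepts pods that carry both the task-name
// and job-version annotations (the constant names are from the batch API; the
// literal values shown are illustrative):
//
//	ji := &JobInfo{Pods: map[string]map[string]*v1.Pod{}}
//	pod := &v1.Pod{}
//	pod.Name, pod.Namespace = "demo-worker-0", "default"
//	pod.Annotations = map[string]string{
//		batch.TaskSpecKey: "worker",
//		batch.JobVersion:  "0",
//	}
//	_ = ji.AddPod(pod) // a second AddPod with the same pod returns "duplicated pod"
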
// UpdatePod replaces the cached pod entry with the given pod.
func (ji *JobInfo) UpdatePod(pod *v1.Pod) error {
taskName, found := pod.Annotations[batch.TaskSpecKey]
if !found {
return fmt.Errorf("failed to find taskName of Pod <%s/%s>",
pod.Namespace, pod.Name)
}
_, found = pod.Annotations[batch.JobVersion]
if !found {
return fmt.Errorf("failed to find jobVersion of Pod <%s/%s>",
pod.Namespace, pod.Name)
}
if _, found := ji.Pods[taskName]; !found {
return fmt.Errorf("can not find task %s in cache", taskName)
}
if _, found := ji.Pods[taskName][pod.Name]; !found {
return fmt.Errorf("can not find pod <%s/%s> in cache",
pod.Namespace, pod.Name)
}
ji.Pods[taskName][pod.Name] = pod
return nil
}
// DeletePod removes the given pod from the JobInfo.
func (ji *JobInfo) DeletePod(pod *v1.Pod) error {
taskName, found := pod.Annotations[batch.TaskSpecKey]
if !found {
return fmt.Errorf("failed to find taskName of Pod <%s/%s>",
pod.Namespace, pod.Name)
}
_, found = pod.Annotations[batch.JobVersion]
if !found {
return fmt.Errorf("failed to find jobVersion of Pod <%s/%s>",
pod.Namespace, pod.Name)
}
if pods, found := ji.Pods[taskName]; found {
delete(pods, pod.Name)
if len(pods) == 0 {
delete(ji.Pods, taskName)
}
}
return nil
}
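// exampleTrackPod is a minimal, illustrative sketch (not called by the controllers) of
// how a pod is tracked in a JobInfo; the namespace, pod and task names are hypothetical.
// A pod must carry the batch.TaskSpecKey and batch.JobVersion annotations, otherwise the
// pod methods above reject it.
func exampleTrackPod() error {
    ji := &JobInfo{Pods: make(map[string]map[string]*v1.Pod)}
    pod := &v1.Pod{}
    pod.Namespace, pod.Name = "default", "my-job-worker-0"
    pod.Annotations = map[string]string{
        batch.TaskSpecKey: "worker", // task the pod belongs to
        batch.JobVersion:  "0",      // job version the pod was created for
    }
    if err := ji.AddPod(pod); err != nil {
        return err
    }
    // Adding the same pod twice would return a "duplicated pod" error;
    // UpdatePod replaces the stored object and DeletePod removes it.
    return ji.DeletePod(pod)
}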
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package apis
import (
"fmt"
"volcano.sh/apis/pkg/apis/bus/v1alpha1"
)
// Request is a work item queued to the job controller: it identifies a job (and optionally a task) together with the triggering event and the action to take.
type Request struct {
Namespace string
JobName string
TaskName string
QueueName string
Event v1alpha1.Event
ExitCode int32
Action v1alpha1.Action
JobVersion int32
}
// String returns a human-readable representation of the request.
func (r Request) String() string {
return fmt.Sprintf(
"Queue: %s, Job: %s/%s, Task:%s, Event:%s, ExitCode:%d, Action:%s, JobVersion: %d",
r.QueueName, r.Namespace, r.JobName, r.TaskName, r.Event, r.ExitCode, r.Action, r.JobVersion)
}
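// exampleRequestString is a small illustrative sketch (unused at runtime) of a request
// and the kind of line String produces; the namespace, job, task and queue names are
// hypothetical, and the event/action constants are reused from the bus API.
func exampleRequestString() string {
    r := Request{
        Namespace: "default",
        JobName:   "my-job",
        TaskName:  "worker",
        QueueName: "default",
        Event:     v1alpha1.OutOfSyncEvent,
        Action:    v1alpha1.SyncJobAction,
    }
    // Produces something like:
    // Queue: default, Job: default/my-job, Task:worker, Event:OutOfSync, ExitCode:0, Action:SyncJob, JobVersion: 0
    return r.String()
}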
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"fmt"
"sync"
"time"
"golang.org/x/time/rate"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/volcano/pkg/controllers/apis"
)
type jobCache struct {
sync.Mutex
jobs map[string]*apis.JobInfo
deletedJobs workqueue.RateLimitingInterface
}
func keyFn(ns, name string) string {
return fmt.Sprintf("%s/%s", ns, name)
}
// JobKeyByName returns the cache key for the given namespace and job name.
func JobKeyByName(namespace string, name string) string {
return keyFn(namespace, name)
}
// JobKeyByReq returns the cache key of the job referenced by the request.
func JobKeyByReq(req *apis.Request) string {
return keyFn(req.Namespace, req.JobName)
}
// JobKey returns the "namespace/name" key of the given job.
func JobKey(job *v1alpha1.Job) string {
return keyFn(job.Namespace, job.Name)
}
func jobTerminated(job *apis.JobInfo) bool {
return job.Job == nil && len(job.Pods) == 0
}
func jobKeyOfPod(pod *v1.Pod) (string, error) {
jobName, found := pod.Annotations[v1alpha1.JobNameKey]
if !found {
return "", fmt.Errorf("failed to find job name of pod <%s/%s>",
pod.Namespace, pod.Name)
}
return keyFn(pod.Namespace, jobName), nil
}
// New gets the job Cache.
func New() Cache {
queue := workqueue.NewMaxOfRateLimiter(
workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second),
// 10 qps, 100 bucket size. This only limits the overall retry speed; it is not applied per item.
&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
)
return &jobCache{
jobs: map[string]*apis.JobInfo{},
deletedJobs: workqueue.NewRateLimitingQueue(queue),
}
}
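// exampleCacheLifecycle is an illustrative sketch (not wired into any controller) of the
// basic cache flow: Add registers a job, Get returns a clone keyed by "namespace/name",
// and Delete hands the entry to the cleanup worker started by Run. The job name is
// hypothetical.
func exampleCacheLifecycle() error {
    c := New()
    job := &v1alpha1.Job{}
    job.Namespace, job.Name = "default", "my-job"
    if err := c.Add(job); err != nil {
        return err
    }
    if _, err := c.Get(JobKey(job)); err != nil {
        return err
    }
    return c.Delete(job)
}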
func (jc *jobCache) Get(key string) (*apis.JobInfo, error) {
jc.Lock()
defer jc.Unlock()
job, found := jc.jobs[key]
if !found {
return nil, fmt.Errorf("failed to find job <%s>", key)
}
if job.Job == nil {
return nil, fmt.Errorf("job <%s> is not ready", key)
}
return job.Clone(), nil
}
func (jc *jobCache) GetStatus(key string) (*v1alpha1.JobStatus, error) {
jc.Lock()
defer jc.Unlock()
job, found := jc.jobs[key]
if !found {
return nil, fmt.Errorf("failed to find job <%s>", key)
}
if job.Job == nil {
return nil, fmt.Errorf("job <%s> is not ready", key)
}
status := job.Job.Status
return &status, nil
}
func (jc *jobCache) Add(job *v1alpha1.Job) error {
jc.Lock()
defer jc.Unlock()
key := JobKey(job)
if jobInfo, found := jc.jobs[key]; found {
if jobInfo.Job == nil {
jobInfo.SetJob(job)
return nil
}
return fmt.Errorf("duplicated jobInfo <%v>", key)
}
jc.jobs[key] = &apis.JobInfo{
Name: job.Name,
Namespace: job.Namespace,
Job: job,
Pods: make(map[string]map[string]*v1.Pod),
}
return nil
}
func (jc *jobCache) Update(obj *v1alpha1.Job) error {
jc.Lock()
defer jc.Unlock()
key := JobKey(obj)
job, found := jc.jobs[key]
if !found {
return fmt.Errorf("failed to find job <%v>", key)
}
job.Job = obj
return nil
}
func (jc *jobCache) Delete(obj *v1alpha1.Job) error {
jc.Lock()
defer jc.Unlock()
key := JobKey(obj)
jobInfo, found := jc.jobs[key]
if !found {
return fmt.Errorf("failed to find job <%v>", key)
}
jobInfo.Job = nil
jc.deleteJob(jobInfo)
return nil
}
func (jc *jobCache) AddPod(pod *v1.Pod) error {
jc.Lock()
defer jc.Unlock()
key, err := jobKeyOfPod(pod)
if err != nil {
return err
}
job, found := jc.jobs[key]
if !found {
job = &apis.JobInfo{
Pods: make(map[string]map[string]*v1.Pod),
}
jc.jobs[key] = job
}
return job.AddPod(pod)
}
func (jc *jobCache) UpdatePod(pod *v1.Pod) error {
jc.Lock()
defer jc.Unlock()
key, err := jobKeyOfPod(pod)
if err != nil {
return err
}
job, found := jc.jobs[key]
if !found {
job = &apis.JobInfo{
Pods: make(map[string]map[string]*v1.Pod),
}
jc.jobs[key] = job
}
return job.UpdatePod(pod)
}
func (jc *jobCache) DeletePod(pod *v1.Pod) error {
jc.Lock()
defer jc.Unlock()
key, err := jobKeyOfPod(pod)
if err != nil {
return err
}
job, found := jc.jobs[key]
if !found {
job = &apis.JobInfo{
Pods: make(map[string]map[string]*v1.Pod),
}
jc.jobs[key] = job
}
if err := job.DeletePod(pod); err != nil {
return err
}
if jc.jobs[key].Job == nil {
jc.deleteJob(job)
}
return nil
}
func (jc *jobCache) Run(stopCh <-chan struct{}) {
wait.Until(jc.worker, 0, stopCh)
}
func (jc *jobCache) TaskCompleted(jobKey, taskName string) bool {
jc.Lock()
defer jc.Unlock()
var taskReplicas, completed int32
jobInfo, found := jc.jobs[jobKey]
if !found {
return false
}
taskPods, found := jobInfo.Pods[taskName]
if !found {
return false
}
if jobInfo.Job == nil {
return false
}
for _, task := range jobInfo.Job.Spec.Tasks {
if task.Name == taskName {
taskReplicas = task.Replicas
break
}
}
if taskReplicas <= 0 {
return false
}
for _, pod := range taskPods {
if pod.Status.Phase == v1.PodSucceeded {
completed++
}
}
return completed >= taskReplicas
}
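// Note on the retry budget used below: task.MaxRetry of -1 disables the check, 0 falls
// back to a default of 3 for compatibility, and a task counts as failed once the summed
// container and init-container restart counts of its running/pending pods exceed that
// budget.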
func (jc *jobCache) TaskFailed(jobKey, taskName string) bool {
jc.Lock()
defer jc.Unlock()
var taskReplicas, retried, maxRetry int32
jobInfo, found := jc.jobs[jobKey]
if !found {
return false
}
taskPods, found := jobInfo.Pods[taskName]
if !found || jobInfo.Job == nil {
return false
}
for _, task := range jobInfo.Job.Spec.Tasks {
if task.Name == taskName {
maxRetry = task.MaxRetry
taskReplicas = task.Replicas
break
}
}
// maxRetry == -1 means no limit
if taskReplicas == 0 || maxRetry == -1 {
return false
}
// Compatible with existing jobs: default to 3 retries when MaxRetry is unset.
if maxRetry == 0 {
maxRetry = 3
}
for _, pod := range taskPods {
if pod.Status.Phase == v1.PodRunning || pod.Status.Phase == v1.PodPending {
for j := range pod.Status.InitContainerStatuses {
stat := pod.Status.InitContainerStatuses[j]
retried += stat.RestartCount
}
for j := range pod.Status.ContainerStatuses {
stat := pod.Status.ContainerStatuses[j]
retried += stat.RestartCount
}
}
}
return retried > maxRetry
}
func (jc *jobCache) worker() {
for jc.processCleanupJob() {
}
}
func (jc *jobCache) processCleanupJob() bool {
obj, shutdown := jc.deletedJobs.Get()
if shutdown {
return false
}
defer jc.deletedJobs.Done(obj)
job, ok := obj.(*apis.JobInfo)
if !ok {
klog.Errorf("failed to convert %v to *apis.JobInfo", obj)
return true
}
jc.Mutex.Lock()
defer jc.Mutex.Unlock()
if jobTerminated(job) {
jc.deletedJobs.Forget(obj)
key := keyFn(job.Namespace, job.Name)
delete(jc.jobs, key)
klog.V(3).Infof("Job <%s> was deleted.", key)
} else {
// Retry
jc.deleteJob(job)
}
return true
}
func (jc *jobCache) deleteJob(job *apis.JobInfo) {
klog.V(3).Infof("Try to delete Job <%v/%v>",
job.Namespace, job.Name)
jc.deletedJobs.AddRateLimited(job)
}
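// Note on the cleanup path: Delete (and DeletePod, once the Job object itself is gone)
// pushes the JobInfo onto the rate-limited deletedJobs queue via deleteJob; the worker's
// processCleanupJob removes the entry from the jobs map only when both the Job and all of
// its pods have disappeared, and otherwise re-enqueues it for a later retry.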
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package garbagecollector
import (
"context"
"fmt"
"time"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/batch/v1alpha1"
vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
batchinformers "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
batchlisters "volcano.sh/apis/pkg/client/listers/batch/v1alpha1"
"volcano.sh/volcano/pkg/controllers/framework"
)
func init() {
framework.RegisterController(&gccontroller{})
}
// gccontroller runs reflectors to watch for changes of managed API
// objects. Currently it only watches Jobs. Triggered by Job creation
// and updates, it enqueues Jobs that have non-nil `.spec.ttlSecondsAfterFinished`
// to the `queue`. The gccontroller has workers that consume `queue` and check whether
// a Job's TTL has expired; if it hasn't, the worker re-adds the Job to the queue at the
// time the TTL is expected to expire; if it has, the worker sends a request to the API
// server to delete the Job.
// This is implemented outside of Job controller for separation of concerns, and
// because it will be extended to handle other finishable resource types.
type gccontroller struct {
vcClient vcclientset.Interface
jobInformer batchinformers.JobInformer
// A store of jobs
jobLister batchlisters.JobLister
jobSynced func() bool
// queue of Job keys to be checked for TTL expiry.
queue workqueue.RateLimitingInterface
}
func (gc *gccontroller) Name() string {
return "gc-controller"
}
// Initialize sets up the gccontroller's client, informer, lister and work queue.
func (gc *gccontroller) Initialize(opt *framework.ControllerOption) error {
gc.vcClient = opt.VolcanoClient
jobInformer := informerfactory.NewSharedInformerFactory(gc.vcClient, 0).Batch().V1alpha1().Jobs()
gc.jobInformer = jobInformer
gc.jobLister = jobInformer.Lister()
gc.jobSynced = jobInformer.Informer().HasSynced
gc.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: gc.addJob,
UpdateFunc: gc.updateJob,
})
return nil
}
// Run starts the worker to clean up Jobs.
func (gc *gccontroller) Run(stopCh <-chan struct{}) {
defer gc.queue.ShutDown()
klog.Infof("Starting garbage collector")
defer klog.Infof("Shutting down garbage collector")
go gc.jobInformer.Informer().Run(stopCh)
if !cache.WaitForCacheSync(stopCh, gc.jobSynced) {
return
}
go wait.Until(gc.worker, time.Second, stopCh)
<-stopCh
}
func (gc *gccontroller) addJob(obj interface{}) {
job := obj.(*v1alpha1.Job)
klog.V(4).Infof("Adding job %s/%s", job.Namespace, job.Name)
if job.DeletionTimestamp == nil && needsCleanup(job) {
gc.enqueue(job)
}
}
func (gc *gccontroller) updateJob(old, cur interface{}) {
job := cur.(*v1alpha1.Job)
klog.V(4).Infof("Updating job %s/%s", job.Namespace, job.Name)
if job.DeletionTimestamp == nil && needsCleanup(job) {
gc.enqueue(job)
}
}
func (gc *gccontroller) enqueue(job *v1alpha1.Job) {
klog.V(4).Infof("Add job %s/%s to cleanup", job.Namespace, job.Name)
key, err := cache.MetaNamespaceKeyFunc(job)
if err != nil {
klog.Errorf("couldn't get key for object %#v: %v", job, err)
return
}
gc.queue.Add(key)
}
func (gc *gccontroller) enqueueAfter(job *v1alpha1.Job, after time.Duration) {
key, err := cache.MetaNamespaceKeyFunc(job)
if err != nil {
klog.Errorf("couldn't get key for object %#v: %v", job, err)
return
}
gc.queue.AddAfter(key, after)
}
func (gc *gccontroller) worker() {
for gc.processNextWorkItem() {
}
}
func (gc *gccontroller) processNextWorkItem() bool {
key, quit := gc.queue.Get()
if quit {
return false
}
defer gc.queue.Done(key)
err := gc.processJob(key.(string))
gc.handleErr(err, key)
return true
}
func (gc *gccontroller) handleErr(err error, key interface{}) {
if err == nil {
gc.queue.Forget(key)
return
}
klog.Errorf("error cleaning up Job %v, will retry: %v", key, err)
gc.queue.AddRateLimited(key)
}
// processJob will check the Job's state and TTL and delete the Job when it
// finishes and its TTL after finished has expired. If the Job hasn't finished or
// its TTL hasn't expired, it will be added to the queue after the TTL is expected
// to expire.
// This function is not meant to be invoked concurrently with the same key.
func (gc *gccontroller) processJob(key string) error {
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return err
}
klog.V(4).Infof("Checking if Job %s/%s is ready for cleanup", namespace, name)
// Ignore the Jobs that are already deleted or being deleted, or the ones that don't need clean up.
job, err := gc.jobLister.Jobs(namespace).Get(name)
if errors.IsNotFound(err) {
return nil
}
if err != nil {
return err
}
if expired, err := gc.processTTL(job); err != nil {
return err
} else if !expired {
return nil
}
// The Job's TTL is assumed to have expired, but the Job TTL might be stale.
// Before deleting the Job, do a final sanity check.
// If TTL is modified before we do this check, we cannot be sure if the TTL truly expires.
// The latest Job may have a different UID, but it's fine because the checks will be run again.
fresh, err := gc.vcClient.BatchV1alpha1().Jobs(namespace).Get(context.TODO(), name, metav1.GetOptions{})
if errors.IsNotFound(err) {
return nil
}
if err != nil {
return err
}
// Use the latest Job TTL to see if the TTL truly expires.
if expired, err := gc.processTTL(fresh); err != nil {
return err
} else if !expired {
return nil
}
// Cascade deletes the Jobs if TTL truly expires.
policy := metav1.DeletePropagationForeground
options := metav1.DeleteOptions{
PropagationPolicy: &policy,
Preconditions: &metav1.Preconditions{UID: &fresh.UID},
}
klog.V(4).Infof("Cleaning up Job %s/%s", namespace, name)
return gc.vcClient.BatchV1alpha1().Jobs(fresh.Namespace).Delete(context.TODO(), fresh.Name, options)
}
// processTTL checks whether a given Job's TTL has expired; if it has not, the Job is
// re-queued to be checked again once the TTL is expected to expire.
func (gc *gccontroller) processTTL(job *v1alpha1.Job) (expired bool, err error) {
// We don't care about the Jobs that are going to be deleted, or the ones that don't need clean up.
if job.DeletionTimestamp != nil || !needsCleanup(job) {
return false, nil
}
now := time.Now()
t, err := timeLeft(job, &now)
if err != nil {
return false, err
}
// TTL has expired
if *t <= 0 {
return true, nil
}
gc.enqueueAfter(job, *t)
return false, nil
}
// needsCleanup checks whether a Job has finished and has a TTL set.
func needsCleanup(j *v1alpha1.Job) bool {
return j.Spec.TTLSecondsAfterFinished != nil && isJobFinished(j)
}
func isJobFinished(job *v1alpha1.Job) bool {
return job.Status.State.Phase == v1alpha1.Completed ||
job.Status.State.Phase == v1alpha1.Failed ||
job.Status.State.Phase == v1alpha1.Terminated
}
func getFinishAndExpireTime(j *v1alpha1.Job) (*time.Time, *time.Time, error) {
if !needsCleanup(j) {
return nil, nil, fmt.Errorf("job %s/%s should not be cleaned up", j.Namespace, j.Name)
}
finishAt, err := jobFinishTime(j)
if err != nil {
return nil, nil, err
}
finishAtUTC := finishAt.UTC()
expireAtUTC := finishAtUTC.Add(time.Duration(*j.Spec.TTLSecondsAfterFinished) * time.Second)
return &finishAtUTC, &expireAtUTC, nil
}
func timeLeft(j *v1alpha1.Job, since *time.Time) (*time.Duration, error) {
finishAt, expireAt, err := getFinishAndExpireTime(j)
if err != nil {
return nil, err
}
if finishAt.UTC().After(since.UTC()) {
klog.Warningf("Warning: Found Job %s/%s finished in the future. This is likely due to time skew in the cluster. Job cleanup will be deferred.", j.Namespace, j.Name)
}
remaining := expireAt.UTC().Sub(since.UTC())
klog.V(4).Infof("Found Job %s/%s finished at %v, remaining TTL %v since %v, TTL will expire at %v", j.Namespace, j.Name, finishAt.UTC(), remaining, since.UTC(), expireAt.UTC())
return &remaining, nil
}
// jobFinishTime takes an already finished Job and returns the time it finishes.
func jobFinishTime(finishedJob *v1alpha1.Job) (metav1.Time, error) {
if finishedJob.Status.State.LastTransitionTime.IsZero() {
return metav1.Time{}, fmt.Errorf("unable to find the time when the Job %s/%s finished", finishedJob.Namespace, finishedJob.Name)
}
return finishedJob.Status.State.LastTransitionTime, nil
}
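// Note on the TTL math: the finish time is the Job's Status.State.LastTransitionTime, the
// expiry time is that finish time plus Spec.TTLSecondsAfterFinished, and timeLeft reports
// the remaining duration relative to "since". processTTL deletes nothing itself; it only
// decides whether to act now or re-queue the key via enqueueAfter until the TTL is
// expected to expire.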
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helpers
import (
"fmt"
"math/rand"
"strconv"
"strings"
"time"
v1 "k8s.io/api/core/v1"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/volcano/pkg/controllers/apis"
"volcano.sh/volcano/pkg/scheduler/api"
)
const (
// PodNameFmt pod name format
PodNameFmt = "%s-%s-%d"
// persistentVolumeClaimFmt represents persistent volume claim name format
persistentVolumeClaimFmt = "%s-pvc-%s"
)
// GetPodIndexUnderTask returns the pod's index under its task, parsed from the pod name.
func GetPodIndexUnderTask(pod *v1.Pod) string {
num := strings.Split(pod.Name, "-")
if len(num) >= 3 {
return num[len(num)-1]
}
return ""
}
// CompareTask orders two tasks by their pods' indices, falling back to creation timestamp when an index is missing or the indices are equal.
func CompareTask(lv, rv *api.TaskInfo) bool {
lStr := GetPodIndexUnderTask(lv.Pod)
rStr := GetPodIndexUnderTask(rv.Pod)
lIndex, lErr := strconv.Atoi(lStr)
rIndex, rErr := strconv.Atoi(rStr)
if lErr != nil || rErr != nil || lIndex == rIndex {
return lv.Pod.CreationTimestamp.Before(&rv.Pod.CreationTimestamp)
}
if lIndex > rIndex {
return false
}
return true
}
// GetTaskKey returns task key/name
func GetTaskKey(pod *v1.Pod) string {
if pod.Annotations == nil || pod.Annotations[batch.TaskSpecKey] == "" {
return batch.DefaultTaskSpec
}
return pod.Annotations[batch.TaskSpecKey]
}
// GetTaskSpec returns task spec
func GetTaskSpec(job *batch.Job, taskName string) (batch.TaskSpec, bool) {
for _, ts := range job.Spec.Tasks {
if ts.Name == taskName {
return ts, true
}
}
return batch.TaskSpec{}, false
}
// MakeDomainName creates the task pod's domain name (hostname.subdomain).
func MakeDomainName(ts batch.TaskSpec, job *batch.Job, index int) string {
hostName := ts.Template.Spec.Hostname
subdomain := ts.Template.Spec.Subdomain
if len(hostName) == 0 {
hostName = MakePodName(job.Name, ts.Name, index)
}
if len(subdomain) == 0 {
subdomain = job.Name
}
return hostName + "." + subdomain
}
// MakePodName creates pod name.
func MakePodName(jobName string, taskName string, index int) string {
return fmt.Sprintf(PodNameFmt, jobName, taskName, index)
}
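// examplePodNaming is an illustrative sketch (unused by the controller) of the naming
// round trip: MakePodName composes "<job>-<task>-<index>" through PodNameFmt, and
// GetPodIndexUnderTask recovers the trailing index from such a name. The names are
// hypothetical.
func examplePodNaming() string {
    pod := &v1.Pod{}
    pod.Name = MakePodName("my-job", "worker", 2) // "my-job-worker-2"
    return GetPodIndexUnderTask(pod)              // "2"
}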
// GenRandomStr generates a random lowercase alphanumeric string of length l.
func GenRandomStr(l int) string {
str := "0123456789abcdefghijklmnopqrstuvwxyz"
bytes := []byte(str)
var result []byte
r := rand.New(rand.NewSource(time.Now().UnixNano()))
for i := 0; i < l; i++ {
result = append(result, bytes[r.Intn(len(bytes))])
}
return string(result)
}
// GenPVCName generates a PVC name from the job name plus a random suffix.
func GenPVCName(jobName string) string {
return fmt.Sprintf(persistentVolumeClaimFmt, jobName, GenRandomStr(12))
}
// GetJobKeyByReq gets the key for the job request.
func GetJobKeyByReq(req *apis.Request) string {
return fmt.Sprintf("%s/%s", req.Namespace, req.JobName)
}
// GetTasklndexUnderJob returns the index of the task in the job, or -1 if the task is not found.
func GetTasklndexUnderJob(taskName string, job *batch.Job) int {
for index, task := range job.Spec.Tasks {
if task.Name == taskName {
return index
}
}
return -1
}
// GetPodsNameUnderTask returns the names of all pods under the given task.
func GetPodsNameUnderTask(taskName string, job *batch.Job) []string {
var res []string
for _, task := range job.Spec.Tasks {
if task.Name == taskName {
for index := 0; index < int(task.Replicas); index++ {
res = append(res, MakePodName(job.Name, taskName, index))
}
break
}
}
return res
}
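// examplePodsUnderTask is an illustrative sketch (not called anywhere) showing how
// GetPodsNameUnderTask expands a task into its replica pod names; the job and task
// below are hypothetical.
func examplePodsUnderTask() []string {
    job := &batch.Job{}
    job.Name = "my-job"
    job.Spec.Tasks = []batch.TaskSpec{{Name: "worker", Replicas: 3}}
    // Returns [my-job-worker-0 my-job-worker-1 my-job-worker-2].
    return GetPodsNameUnderTask("worker", job)
}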
/*
Copyright 2017 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
"hash"
"hash/fnv"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
coreinformers "k8s.io/client-go/informers/core/v1"
kubeschedulinginformers "k8s.io/client-go/informers/scheduling/v1"
"k8s.io/client-go/kubernetes"
corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
corelisters "k8s.io/client-go/listers/core/v1"
kubeschedulinglisters "k8s.io/client-go/listers/scheduling/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
vcscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
batchinformer "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
businformer "volcano.sh/apis/pkg/client/informers/externalversions/bus/v1alpha1"
schedulinginformers "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
batchlister "volcano.sh/apis/pkg/client/listers/batch/v1alpha1"
buslister "volcano.sh/apis/pkg/client/listers/bus/v1alpha1"
schedulinglisters "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/apis"
jobcache "volcano.sh/volcano/pkg/controllers/cache"
"volcano.sh/volcano/pkg/controllers/framework"
"volcano.sh/volcano/pkg/controllers/job/state"
)
func init() {
framework.RegisterController(&jobcontroller{})
}
// jobcontroller is the controller that reconciles volcano Jobs.
type jobcontroller struct {
kubeClient kubernetes.Interface
vcClient vcclientset.Interface
jobInformer batchinformer.JobInformer
podInformer coreinformers.PodInformer
pvcInformer coreinformers.PersistentVolumeClaimInformer
pgInformer schedulinginformers.PodGroupInformer
svcInformer coreinformers.ServiceInformer
cmdInformer businformer.CommandInformer
pcInformer kubeschedulinginformers.PriorityClassInformer
queueInformer schedulinginformers.QueueInformer
// A store of jobs
jobLister batchlister.JobLister
jobSynced func() bool
// A store of pods
podLister corelisters.PodLister
podSynced func() bool
pvcLister corelisters.PersistentVolumeClaimLister
pvcSynced func() bool
// A store of podgroups
pgLister schedulinglisters.PodGroupLister
pgSynced func() bool
// A store of services
svcLister corelisters.ServiceLister
svcSynced func() bool
cmdLister buslister.CommandLister
cmdSynced func() bool
pcLister kubeschedulinglisters.PriorityClassLister
pcSynced func() bool
queueLister schedulinglisters.QueueLister
queueSynced func() bool
// worker queues of requests that need to be synced
queueList []workqueue.RateLimitingInterface
commandQueue workqueue.RateLimitingInterface
cache jobcache.Cache
// Job Event recorder
recorder record.EventRecorder
errTasks workqueue.RateLimitingInterface
workers uint32
maxRequeueNum int
}
func (cc *jobcontroller) Name() string {
return "job-controller"
}
// Initialize sets up the job controller's clients, informers, listers, queues and event recorder.
func (cc *jobcontroller) Initialize(opt *framework.ControllerOption) error {
cc.kubeClient = opt.KubeClient
cc.vcClient = opt.VolcanoClient
sharedInformers := opt.SharedInformerFactory
workers := opt.WorkerNum
// Initialize event client
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartLogging(klog.Infof)
eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: cc.kubeClient.CoreV1().Events("")})
recorder := eventBroadcaster.NewRecorder(vcscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"})
cc.queueList = make([]workqueue.RateLimitingInterface, workers)
cc.commandQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
cc.cache = jobcache.New()
cc.errTasks = newRateLimitingQueue()
cc.recorder = recorder
cc.workers = workers
cc.maxRequeueNum = opt.MaxRequeueNum
if cc.maxRequeueNum < 0 {
cc.maxRequeueNum = -1
}
var i uint32
for i = 0; i < workers; i++ {
cc.queueList[i] = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
}
cc.jobInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Batch().V1alpha1().Jobs()
cc.jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: cc.addJob,
UpdateFunc: cc.updateJob,
DeleteFunc: cc.deleteJob,
})
cc.jobLister = cc.jobInformer.Lister()
cc.jobSynced = cc.jobInformer.Informer().HasSynced
cc.cmdInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Bus().V1alpha1().Commands()
cc.cmdInformer.Informer().AddEventHandler(
cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch v := obj.(type) {
case *busv1alpha1.Command:
if v.TargetObject != nil &&
v.TargetObject.APIVersion == batchv1alpha1.SchemeGroupVersion.String() &&
v.TargetObject.Kind == "Job" {
return true
}
return false
default:
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: cc.addCommand,
},
},
)
cc.cmdLister = cc.cmdInformer.Lister()
cc.cmdSynced = cc.cmdInformer.Informer().HasSynced
cc.podInformer = sharedInformers.Core().V1().Pods()
cc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: cc.addPod,
UpdateFunc: cc.updatePod,
DeleteFunc: cc.deletePod,
})
cc.podLister = cc.podInformer.Lister()
cc.podSynced = cc.podInformer.Informer().HasSynced
cc.pvcInformer = sharedInformers.Core().V1().PersistentVolumeClaims()
cc.pvcLister = cc.pvcInformer.Lister()
cc.pvcSynced = cc.pvcInformer.Informer().HasSynced
cc.svcInformer = sharedInformers.Core().V1().Services()
cc.svcLister = cc.svcInformer.Lister()
cc.svcSynced = cc.svcInformer.Informer().HasSynced
cc.pgInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Scheduling().V1beta1().PodGroups()
cc.pgInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
UpdateFunc: cc.updatePodGroup,
})
cc.pgLister = cc.pgInformer.Lister()
cc.pgSynced = cc.pgInformer.Informer().HasSynced
cc.pcInformer = sharedInformers.Scheduling().V1().PriorityClasses()
cc.pcLister = cc.pcInformer.Lister()
cc.pcSynced = cc.pcInformer.Informer().HasSynced
cc.queueInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Scheduling().V1beta1().Queues()
cc.queueLister = cc.queueInformer.Lister()
cc.queueSynced = cc.queueInformer.Informer().HasSynced
// Register actions
state.SyncJob = cc.syncJob
state.KillJob = cc.killJob
return nil
}
// Run starts the JobController.
func (cc *jobcontroller) Run(stopCh <-chan struct{}) {
go cc.jobInformer.Informer().Run(stopCh)
go cc.podInformer.Informer().Run(stopCh)
go cc.pvcInformer.Informer().Run(stopCh)
go cc.pgInformer.Informer().Run(stopCh)
go cc.svcInformer.Informer().Run(stopCh)
go cc.cmdInformer.Informer().Run(stopCh)
go cc.pcInformer.Informer().Run(stopCh)
go cc.queueInformer.Informer().Run(stopCh)
cache.WaitForCacheSync(stopCh, cc.jobSynced, cc.podSynced, cc.pgSynced,
cc.svcSynced, cc.cmdSynced, cc.pvcSynced, cc.pcSynced, cc.queueSynced)
go wait.Until(cc.handleCommands, 0, stopCh)
var i uint32
for i = 0; i < cc.workers; i++ {
go func(num uint32) {
wait.Until(
func() {
cc.worker(num)
},
time.Second,
stopCh)
}(i)
}
go cc.cache.Run(stopCh)
// Re-sync error tasks.
go wait.Until(cc.processResyncTask, 0, stopCh)
klog.Infof("JobController is running ...... ")
}
func (cc *jobcontroller) worker(i uint32) {
klog.Infof("worker %d start ...... ", i)
for cc.processNextReq(i) {
}
}
func (cc *jobcontroller) belongsToThisRoutine(key string, count uint32) bool {
var hashVal hash.Hash32
var val uint32
hashVal = fnv.New32()
hashVal.Write([]byte(key))
val = hashVal.Sum32()
return val%cc.workers == count
}
func (cc *jobcontroller) getWorkerQueue(key string) workqueue.RateLimitingInterface {
var hashVal hash.Hash32
var val uint32
hashVal = fnv.New32()
hashVal.Write([]byte(key))
val = hashVal.Sum32()
queue := cc.queueList[val%cc.workers]
return queue
}
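// exampleWorkerAssignment is an illustrative sketch (unused at runtime) of the sharding
// rule used by belongsToThisRoutine and getWorkerQueue: the fnv-32 hash of the
// "namespace/jobName" key modulo the worker count picks a fixed queue, so every request
// for a given job is handled by the same worker. The key is hypothetical and workers
// must be non-zero.
func exampleWorkerAssignment(workers uint32) uint32 {
    h := fnv.New32()
    h.Write([]byte("default/my-job"))
    return h.Sum32() % workers
}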
func (cc *jobcontroller) processNextReq(count uint32) bool {
queue := cc.queueList[count]
obj, shutdown := queue.Get()
if shutdown {
klog.Errorf("Fail to pop item from queue")
return false
}
req := obj.(apis.Request)
defer queue.Done(req)
key := jobcache.JobKeyByReq(&req)
if !cc.belongsToThisRoutine(key, count) {
klog.Errorf("should not occur The job does not belongs to this routine key:%s, worker:%d...... ", key, count)
queueLocal := cc.getWorkerQueue(key)
queueLocal.Add(req)
return true
}
klog.V(3).Infof("Try to handle request <%v>", req)
jobInfo, err := cc.cache.Get(key)
if err != nil {
// TODO(k82cn): ignore not-ready error.
klog.Errorf("Failed to get job by <%v> from cache: %v", req, err)
return true
}
st := state.NewState(jobInfo)
if st == nil {
klog.Errorf("Invalid state <%s> of Job <%v/%v>",
jobInfo.Job.Status.State, jobInfo.Job.Namespace, jobInfo.Job.Name)
return true
}
action := applyPolicies(jobInfo.Job, &req)
klog.V(3).Infof("Execute <%v> on Job <%s/%s> in <%s> by <%T>.",
action, req.Namespace, req.JobName, jobInfo.Job.Status.State.Phase, st)
if action != busv1alpha1.SyncJobAction {
cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf(
"Start to execute action %s ", action))
}
if err := st.Execute(action); err != nil {
if cc.maxRequeueNum == -1 || queue.NumRequeues(req) < cc.maxRequeueNum {
klog.V(2).Infof("Failed to handle Job <%s/%s>: %v",
jobInfo.Job.Namespace, jobInfo.Job.Name, err)
// If any error, requeue it.
queue.AddRateLimited(req)
return true
}
cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf(
"Job failed on action %s for retry limit reached", action))
klog.Warningf("Terminating Job <%s/%s> and releasing resources", jobInfo.Job.Namespace, jobInfo.Job.Name)
if err = st.Execute(busv1alpha1.TerminateJobAction); err != nil {
klog.Errorf("Failed to terminate Job<%s/%s>: %v", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
}
klog.Warningf("Dropping job<%s/%s> out of the queue: %v because max retries has reached", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
}
// If no error, forget it.
queue.Forget(req)
return true
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"context"
"fmt"
"reflect"
"sort"
"sync"
"sync/atomic"
"time"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/apis/pkg/apis/helpers"
scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/apis"
jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
"volcano.sh/volcano/pkg/controllers/job/state"
)
var calMutex sync.Mutex
func (cc *jobcontroller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.PhaseMap, updateStatus state.UpdateStatusFn) error {
job := jobInfo.Job
klog.V(3).Infof("Killing Job <%s/%s>, current version %d", job.Namespace, job.Name, job.Status.Version)
defer klog.V(3).Infof("Finished Job <%s/%s> killing, current version %d", job.Namespace, job.Name, job.Status.Version)
if job.DeletionTimestamp != nil {
klog.Infof("Job <%s/%s> is terminating, skip management process.",
job.Namespace, job.Name)
return nil
}
var pending, running, terminating, succeeded, failed, unknown int32
taskStatusCount := make(map[string]batch.TaskState)
var errs []error
var total int
for _, pods := range jobInfo.Pods {
for _, pod := range pods {
total++
if pod.DeletionTimestamp != nil {
klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
terminating++
continue
}
maxRetry := job.Spec.MaxRetry
lastRetry := false
if job.Status.RetryCount >= maxRetry-1 {
lastRetry = true
}
// Only retain the Failed and Succeeded pods at the last retry.
// If it is not the last retry, kill pod as defined in `podRetainPhase`.
retainPhase := podRetainPhase
if lastRetry {
retainPhase = state.PodRetainPhaseSoft
}
_, retain := retainPhase[pod.Status.Phase]
if !retain {
err := cc.deleteJobPod(job.Name, pod)
if err == nil {
terminating++
continue
}
// Record the error, then account for the pod's status as if it were a retained pod.
errs = append(errs, err)
cc.resyncTask(pod)
}
classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
calcPodStatus(pod, taskStatusCount)
}
}
if len(errs) != 0 {
klog.Errorf("failed to kill pods for job %s/%s, with err %+v", job.Namespace, job.Name, errs)
cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
fmt.Sprintf("Error deleting pods: %+v", errs))
return fmt.Errorf("failed to kill %d pods of %d", len(errs), total)
}
job = job.DeepCopy()
// Job version is bumped only when job is killed
job.Status.Version++
job.Status.Pending = pending
job.Status.Running = running
job.Status.Succeeded = succeeded
job.Status.Failed = failed
job.Status.Terminating = terminating
job.Status.Unknown = unknown
job.Status.TaskStatusCount = taskStatusCount
// Update running duration
klog.V(3).Infof("Running duration is %s", metav1.Duration{Duration: time.Since(jobInfo.Job.CreationTimestamp.Time)}.ToUnstructured())
job.Status.RunningDuration = &metav1.Duration{Duration: time.Since(jobInfo.Job.CreationTimestamp.Time)}
if updateStatus != nil {
if updateStatus(&job.Status) {
job.Status.State.LastTransitionTime = metav1.Now()
jobCondition := newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime)
job.Status.Conditions = append(job.Status.Conditions, jobCondition)
}
}
// must be called before update job status
if err := cc.pluginOnJobDelete(job); err != nil {
return err
}
// Update Job status
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Failed to update status of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
if e := cc.cache.Update(newJob); e != nil {
klog.Errorf("KillJob - Failed to update Job %v/%v in cache: %v",
newJob.Namespace, newJob.Name, e)
return e
}
// Delete PodGroup
if err := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Delete(context.TODO(), job.Name, metav1.DeleteOptions{}); err != nil {
if !apierrors.IsNotFound(err) {
klog.Errorf("Failed to delete PodGroup of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
}
// NOTE(k82cn): DO NOT delete input/output until job is deleted.
return nil
}
func (cc *jobcontroller) initiateJob(job *batch.Job) (*batch.Job, error) {
klog.V(3).Infof("Starting to initiate Job <%s/%s>", job.Namespace, job.Name)
jobInstance, err := cc.initJobStatus(job)
if err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.JobStatusError),
fmt.Sprintf("Failed to initialize job status, err: %v", err))
return nil, err
}
if err := cc.pluginOnJobAdd(jobInstance); err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
fmt.Sprintf("Execute plugin when job add failed, err: %v", err))
return nil, err
}
newJob, err := cc.createJobIOIfNotExist(jobInstance)
if err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PVCError),
fmt.Sprintf("Failed to create PVC, err: %v", err))
return nil, err
}
if err := cc.createOrUpdatePodGroup(newJob); err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
fmt.Sprintf("Failed to create PodGroup, err: %v", err))
return nil, err
}
return newJob, nil
}
func (cc *jobcontroller) initOnJobUpdate(job *batch.Job) error {
klog.V(3).Infof("Starting to initiate Job <%s/%s> on update", job.Namespace, job.Name)
if err := cc.pluginOnJobUpdate(job); err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
fmt.Sprintf("Execute plugin when job add failed, err: %v", err))
return err
}
if err := cc.createOrUpdatePodGroup(job); err != nil {
cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
fmt.Sprintf("Failed to create PodGroup, err: %v", err))
return err
}
return nil
}
func (cc *jobcontroller) GetQueueInfo(queue string) (*scheduling.Queue, error) {
queueInfo, err := cc.queueLister.Get(queue)
if err != nil {
klog.Errorf("Failed to get queue from queueLister, error: %s", err.Error())
}
return queueInfo, err
}
func (cc *jobcontroller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateStatusFn) error {
job := jobInfo.Job
klog.V(3).Infof("Starting to sync up Job <%s/%s>, current version %d", job.Namespace, job.Name, job.Status.Version)
defer klog.V(3).Infof("Finished Job <%s/%s> sync up, current version %d", job.Namespace, job.Name, job.Status.Version)
if jobInfo.Job.DeletionTimestamp != nil {
klog.Infof("Job <%s/%s> is terminating, skip management process.",
jobInfo.Job.Namespace, jobInfo.Job.Name)
return nil
}
// Deep copy the job to avoid mutating the cached object.
job = job.DeepCopy()
// Find queue that job belongs to, and check if the queue has forwarding metadata
queueInfo, err := cc.GetQueueInfo(job.Spec.Queue)
if err != nil {
return err
}
var jobForwarding bool
if len(queueInfo.Spec.ExtendClusters) != 0 {
jobForwarding = true
if len(job.Annotations) == 0 {
job.Annotations = make(map[string]string)
}
job.Annotations[batch.JobForwardingKey] = "true"
job, err = cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("failed to update job: %s/%s, error: %s", job.Namespace, job.Name, err.Error())
return err
}
}
// Skip job initiation if job is already initiated
if !isInitiated(job) {
if job, err = cc.initiateJob(job); err != nil {
return err
}
} else {
// TODO: optimize this; call it only when scaling up/down.
if err = cc.initOnJobUpdate(job); err != nil {
return err
}
}
if len(queueInfo.Spec.ExtendClusters) != 0 {
jobForwarding = true
job.Annotations[batch.JobForwardingKey] = "true"
_, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("failed to update job: %s/%s, error: %s", job.Namespace, job.Name, err.Error())
return err
}
}
var syncTask bool
if pg, _ := cc.pgLister.PodGroups(job.Namespace).Get(job.Name); pg != nil {
if pg.Status.Phase != "" && pg.Status.Phase != scheduling.PodGroupPending {
syncTask = true
}
for _, condition := range pg.Status.Conditions {
if condition.Type == scheduling.PodGroupUnschedulableType {
cc.recorder.Eventf(job, v1.EventTypeWarning, string(batch.PodGroupPending),
fmt.Sprintf("PodGroup %s:%s unschedule,reason: %s", job.Namespace, job.Name, condition.Message))
}
}
}
var jobCondition batch.JobCondition
if !syncTask {
if updateStatus != nil {
if updateStatus(&job.Status) {
job.Status.State.LastTransitionTime = metav1.Now()
jobCondition = newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime)
job.Status.Conditions = append(job.Status.Conditions, jobCondition)
}
}
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Failed to update status of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
if e := cc.cache.Update(newJob); e != nil {
klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
newJob.Namespace, newJob.Name, e)
return e
}
return nil
}
var running, pending, terminating, succeeded, failed, unknown int32
taskStatusCount := make(map[string]batch.TaskState)
podToCreate := make(map[string][]*v1.Pod)
var podToDelete []*v1.Pod
var creationErrs []error
var deletionErrs []error
appendMutex := sync.Mutex{}
appendError := func(container *[]error, err error) {
appendMutex.Lock()
defer appendMutex.Unlock()
*container = append(*container, err)
}
waitCreationGroup := sync.WaitGroup{}
for _, ts := range job.Spec.Tasks {
ts.Template.Name = ts.Name
tc := ts.Template.DeepCopy()
name := ts.Template.Name
pods, found := jobInfo.Pods[name]
if !found {
pods = map[string]*v1.Pod{}
}
var podToCreateEachTask []*v1.Pod
for i := 0; i < int(ts.Replicas); i++ {
podName := fmt.Sprintf(jobhelpers.PodNameFmt, job.Name, name, i)
if pod, found := pods[podName]; !found {
newPod := createJobPod(job, tc, ts.TopologyPolicy, i, jobForwarding)
if err := cc.pluginOnPodCreate(job, newPod); err != nil {
return err
}
podToCreateEachTask = append(podToCreateEachTask, newPod)
waitCreationGroup.Add(1)
} else {
delete(pods, podName)
if pod.DeletionTimestamp != nil {
klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
atomic.AddInt32(&terminating, 1)
continue
}
classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
calcPodStatus(pod, taskStatusCount)
}
}
podToCreate[ts.Name] = podToCreateEachTask
for _, pod := range pods {
podToDelete = append(podToDelete, pod)
}
}
for taskName, podToCreateEachTask := range podToCreate {
if len(podToCreateEachTask) == 0 {
continue
}
go func(taskName string, podToCreateEachTask []*v1.Pod) {
taskIndex := jobhelpers.GetTasklndexUnderJob(taskName, job)
if job.Spec.Tasks[taskIndex].DependsOn != nil {
cc.waitDependsOnTaskMeetCondition(taskName, taskIndex, podToCreateEachTask, job)
}
for _, pod := range podToCreateEachTask {
go func(pod *v1.Pod) {
defer waitCreationGroup.Done()
newPod, err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Create(context.TODO(), pod, metav1.CreateOptions{})
if err != nil && !apierrors.IsAlreadyExists(err) {
// Failed to create the pod; record the error so the sync fails and is retried.
// This is to ensure all pods under the same Job eventually get created,
// so gang-scheduling can schedule the Job successfully.
klog.Errorf("Failed to create pod %s for Job %s, err %#v",
pod.Name, job.Name, err)
appendError(&creationErrs, fmt.Errorf("failed to create pod %s, err: %#v", pod.Name, err))
} else {
classifyAndAddUpPodBaseOnPhase(newPod, &pending, &running, &succeeded, &failed, &unknown)
calcPodStatus(pod, taskStatusCount)
klog.V(5).Infof("Created Task <%s> of Job <%s/%s>",
pod.Name, job.Namespace, job.Name)
}
}(pod)
}
}(taskName, podToCreateEachTask)
}
waitCreationGroup.Wait()
if len(creationErrs) != 0 {
cc.recorder.Event(job, v1.EventTypeWarning, FailedCreatePodReason,
fmt.Sprintf("Error creating pods: %+v", creationErrs))
return fmt.Errorf("failed to create %d pods of %d", len(creationErrs), len(podToCreate))
}
// Delete pods when scaling down.
waitDeletionGroup := sync.WaitGroup{}
waitDeletionGroup.Add(len(podToDelete))
for _, pod := range podToDelete {
go func(pod *v1.Pod) {
defer waitDeletionGroup.Done()
err := cc.deleteJobPod(job.Name, pod)
if err != nil {
// Failed to delete the pod; record the error and resync the task
// so the deletion is retried.
klog.Errorf("Failed to delete pod %s for Job %s, err %#v",
pod.Name, job.Name, err)
appendError(&deletionErrs, err)
cc.resyncTask(pod)
} else {
klog.V(3).Infof("Deleted Task <%s> of Job <%s/%s>",
pod.Name, job.Namespace, job.Name)
atomic.AddInt32(&terminating, 1)
}
}(pod)
}
waitDeletionGroup.Wait()
if len(deletionErrs) != 0 {
cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
fmt.Sprintf("Error deleting pods: %+v", deletionErrs))
return fmt.Errorf("failed to delete %d pods of %d", len(deletionErrs), len(podToDelete))
}
job.Status = batch.JobStatus{
State: job.Status.State,
Pending: pending,
Running: running,
Succeeded: succeeded,
Failed: failed,
Terminating: terminating,
Unknown: unknown,
Version: job.Status.Version,
MinAvailable: job.Spec.MinAvailable,
TaskStatusCount: taskStatusCount,
ControlledResources: job.Status.ControlledResources,
Conditions: job.Status.Conditions,
RetryCount: job.Status.RetryCount,
}
if updateStatus != nil {
if updateStatus(&job.Status) {
job.Status.State.LastTransitionTime = metav1.Now()
jobCondition = newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime)
job.Status.Conditions = append(job.Status.Conditions, jobCondition)
}
}
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Failed to update status of Job %v/%v: %v",
job.Namespace, job.Name, err)
return err
}
if e := cc.cache.Update(newJob); e != nil {
klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
newJob.Namespace, newJob.Name, e)
return e
}
return nil
}
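// Note on the dependsOn handling below: when a task names more than one dependency with
// Iteration "any", pod creation proceeds as soon as any one of those tasks is ready;
// otherwise every named dependency is polled in order until each of them is ready, as
// decided by isDependsOnPodsReady.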
func (cc *jobcontroller) waitDependsOnTaskMeetCondition(taskName string, taskIndex int, podToCreateEachTask []*v1.Pod, job *batch.Job) {
if job.Spec.Tasks[taskIndex].DependsOn != nil {
dependsOn := *job.Spec.Tasks[taskIndex].DependsOn
if len(dependsOn.Name) > 1 && dependsOn.Iteration == batch.IterationAny {
wait.PollInfinite(detectionPeriodOfDependsOntask, func() (bool, error) {
for _, task := range dependsOn.Name {
if cc.isDependsOnPodsReady(task, job) {
return true, nil
}
}
return false, nil
})
} else {
for _, dependsOnTask := range dependsOn.Name {
wait.PollInfinite(detectionPeriodOfDependsOntask, func() (bool, error) {
if cc.isDependsOnPodsReady(dependsOnTask, job) {
return true, nil
}
return false, nil
})
}
}
}
}
func (cc *jobcontroller) isDependsOnPodsReady(task string, job *batch.Job) bool {
dependsOnPods := jobhelpers.GetPodsNameUnderTask(task, job)
dependsOnTaskIndex := jobhelpers.GetTasklndexUnderJob(task, job)
runningPodCount := 0
for _, podName := range dependsOnPods {
pod, err := cc.podLister.Pods(job.Namespace).Get(podName)
if err != nil {
klog.Errorf("Failed to get pod %v/%v %v", job.Namespace, podName, err)
continue
}
if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodSucceeded {
klog.V(5).Infof("Sequential state, pod %v/%v of depends on tasks is not running", pod.Namespace, pod.Name)
continue
}
allContainerReady := true
for _, containerStatus := range pod.Status.ContainerStatuses {
if !containerStatus.Ready {
allContainerReady = false
break
}
}
if allContainerReady {
runningPodCount++
}
}
dependsOnTaskMinReplicas := job.Spec.Tasks[dependsOnTaskIndex].MinAvailable
if dependsOnTaskMinReplicas != nil {
if runningPodCount < int(*dependsOnTaskMinReplicas) {
klog.V(5).Infof("In a depends on startup state, there are already %d pods running, which is less than the minimum number of runs", runningPodCount)
return false
}
}
return true
}
func (cc *jobcontroller) createJobIOIfNotExist(job *batch.Job) (*batch.Job, error) {
// Create the Job's PVCs if they do not exist yet.
var needUpdate bool
if job.Status.ControlledResources == nil {
job.Status.ControlledResources = make(map[string]string)
}
for index, volume := range job.Spec.Volumes {
vcName := volume.VolumeClaimName
if len(vcName) == 0 {
// NOTE(k82cn): Ensure never have duplicated generated names.
for {
vcName = jobhelpers.GenPVCName(job.Name)
exist, err := cc.checkPVCExist(job, vcName)
if err != nil {
return job, err
}
if exist {
continue
}
job.Spec.Volumes[index].VolumeClaimName = vcName
needUpdate = true
break
}
// TODO: check VolumeClaim must be set if VolumeClaimName is empty
if volume.VolumeClaim != nil {
if err := cc.createPVC(job, vcName, volume.VolumeClaim); err != nil {
return job, err
}
}
} else {
exist, err := cc.checkPVCExist(job, vcName)
if err != nil {
return job, err
}
if !exist {
return job, fmt.Errorf("pvc %s is not found, the job will be in the Pending state until the PVC is created", vcName)
}
}
job.Status.ControlledResources["volume-pvc-"+vcName] = vcName
}
if needUpdate {
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Failed to update Job %v/%v for volume claim name: %v ",
job.Namespace, job.Name, err)
return job, err
}
newJob.Status = job.Status
return newJob, err
}
return job, nil
}
func (cc *jobcontroller) checkPVCExist(job *batch.Job, pvc string) (bool, error) {
if _, err := cc.pvcLister.PersistentVolumeClaims(job.Namespace).Get(pvc); err != nil {
if apierrors.IsNotFound(err) {
return false, nil
}
klog.V(3).Infof("Failed to get PVC %s for job <%s/%s>: %v",
pvc, job.Namespace, job.Name, err)
return false, err
}
return true, nil
}
func (cc *jobcontroller) createPVC(job *batch.Job, vcName string, volumeClaim *v1.PersistentVolumeClaimSpec) error {
pvc := &v1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Namespace: job.Namespace,
Name: vcName,
OwnerReferences: []metav1.OwnerReference{
*metav1.NewControllerRef(job, helpers.JobKind),
},
},
Spec: *volumeClaim,
}
klog.V(3).Infof("Try to create PVC: %v", pvc)
if _, e := cc.kubeClient.CoreV1().PersistentVolumeClaims(job.Namespace).Create(context.TODO(), pvc, metav1.CreateOptions{}); e != nil {
klog.V(3).Infof("Failed to create PVC for Job <%s/%s>: %v",
job.Namespace, job.Name, e)
return e
}
return nil
}
func (cc *jobcontroller) createOrUpdatePodGroup(job *batch.Job) error {
// If PodGroup does not exist, create one for Job.
pg, err := cc.pgLister.PodGroups(job.Namespace).Get(job.Name)
if err != nil {
if !apierrors.IsNotFound(err) {
klog.Errorf("Failed to get PodGroup for Job <%s/%s>: %v",
job.Namespace, job.Name, err)
return err
}
minTaskMember := map[string]int32{}
for _, task := range job.Spec.Tasks {
if task.MinAvailable != nil {
minTaskMember[task.Name] = *task.MinAvailable
} else {
minTaskMember[task.Name] = task.Replicas
}
}
pg := &scheduling.PodGroup{
ObjectMeta: metav1.ObjectMeta{
Namespace: job.Namespace,
Name: job.Name,
Annotations: job.Annotations,
Labels: job.Labels,
OwnerReferences: []metav1.OwnerReference{
*metav1.NewControllerRef(job, helpers.JobKind),
},
},
Spec: scheduling.PodGroupSpec{
MinMember: job.Spec.MinAvailable,
MinTaskMember: minTaskMember,
Queue: job.Spec.Queue,
MinResources: cc.calcPGMinResources(job),
PriorityClassName: job.Spec.PriorityClassName,
},
}
if _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Create(context.TODO(), pg, metav1.CreateOptions{}); err != nil {
if !apierrors.IsAlreadyExists(err) {
klog.Errorf("Failed to create PodGroup for Job <%s/%s>: %v",
job.Namespace, job.Name, err)
return err
}
}
return nil
}
pgShouldUpdate := false
if pg.Spec.PriorityClassName != job.Spec.PriorityClassName {
pg.Spec.PriorityClassName = job.Spec.PriorityClassName
pgShouldUpdate = true
}
minResources := cc.calcPGMinResources(job)
if pg.Spec.MinMember != job.Spec.MinAvailable || !reflect.DeepEqual(pg.Spec.MinResources, minResources) {
pg.Spec.MinMember = job.Spec.MinAvailable
pg.Spec.MinResources = minResources
pgShouldUpdate = true
}
if pg.Spec.MinTaskMember == nil {
pgShouldUpdate = true
pg.Spec.MinTaskMember = make(map[string]int32)
}
for _, task := range job.Spec.Tasks {
if task.MinAvailable == nil {
continue
}
if taskMember, ok := pg.Spec.MinTaskMember[task.Name]; !ok {
pgShouldUpdate = true
pg.Spec.MinTaskMember[task.Name] = *task.MinAvailable
} else {
if taskMember == *task.MinAvailable {
continue
}
pgShouldUpdate = true
pg.Spec.MinTaskMember[task.Name] = *task.MinAvailable
}
}
if !pgShouldUpdate {
return nil
}
_, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Update(context.TODO(), pg, metav1.UpdateOptions{})
if err != nil {
klog.V(3).Infof("Failed to update PodGroup for Job <%s/%s>: %v",
job.Namespace, job.Name, err)
}
return err
}
func (cc *jobcontroller) deleteJobPod(jobName string, pod *v1.Pod) error {
err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{})
if err != nil && !apierrors.IsNotFound(err) {
klog.Errorf("Failed to delete pod %s/%s for Job %s, err %#v",
pod.Namespace, pod.Name, jobName, err)
return fmt.Errorf("failed to delete pod %s, err %#v", pod.Name, err)
}
return nil
}
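// Note on calcPGMinResources below: tasks are ordered by the value of their pod
// template's PriorityClass (unresolvable classes count as priority 0), and container
// resources are accumulated via addResourceList, replica by replica, until
// job.Spec.MinAvailable pods are covered; the result becomes the PodGroup's MinResources.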
func (cc *jobcontroller) calcPGMinResources(job *batch.Job) *v1.ResourceList {
// Sort tasks by the priority of their PriorityClass.
var tasksPriority TasksPriority
for _, task := range job.Spec.Tasks {
tp := TaskPriority{0, task}
pc := task.Template.Spec.PriorityClassName
priorityClass, err := cc.pcLister.Get(pc)
if err != nil || priorityClass == nil {
klog.Warningf("Ignore task %s priority class %s: %v", task.Name, pc, err)
} else {
tp.priority = priorityClass.Value
}
tasksPriority = append(tasksPriority, tp)
}
sort.Sort(tasksPriority)
minAvailableTasksRes := v1.ResourceList{}
podCnt := int32(0)
for _, task := range tasksPriority {
for i := int32(0); i < task.Replicas; i++ {
if podCnt >= job.Spec.MinAvailable {
break
}
podCnt++
for _, c := range task.Template.Spec.Containers {
addResourceList(minAvailableTasksRes, c.Resources.Requests, c.Resources.Limits)
}
}
}
return &minAvailableTasksRes
}
func (cc *jobcontroller) initJobStatus(job *batch.Job) (*batch.Job, error) {
if job.Status.State.Phase != "" {
return job, nil
}
job.Status.State.Phase = batch.Pending
job.Status.State.LastTransitionTime = metav1.Now()
job.Status.MinAvailable = job.Spec.MinAvailable
jobCondition := newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime)
job.Status.Conditions = append(job.Status.Conditions, jobCondition)
newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Failed to update status of Job %v/%v: %v",
job.Namespace, job.Name, err)
return nil, err
}
if err := cc.cache.Update(newJob); err != nil {
klog.Errorf("CreateJob - Failed to update Job %v/%v in cache: %v",
newJob.Namespace, newJob.Name, err)
return nil, err
}
return newJob, nil
}
func classifyAndAddUpPodBaseOnPhase(pod *v1.Pod, pending, running, succeeded, failed, unknown *int32) {
switch pod.Status.Phase {
case v1.PodPending:
atomic.AddInt32(pending, 1)
case v1.PodRunning:
atomic.AddInt32(running, 1)
case v1.PodSucceeded:
atomic.AddInt32(succeeded, 1)
case v1.PodFailed:
atomic.AddInt32(failed, 1)
default:
atomic.AddInt32(unknown, 1)
}
}
func calcPodStatus(pod *v1.Pod, taskStatusCount map[string]batch.TaskState) {
taskName, found := pod.Annotations[batch.TaskSpecKey]
if !found {
return
}
calMutex.Lock()
defer calMutex.Unlock()
if _, ok := taskStatusCount[taskName]; !ok {
taskStatusCount[taskName] = batch.TaskState{
Phase: make(map[v1.PodPhase]int32),
}
}
switch pod.Status.Phase {
case v1.PodPending:
taskStatusCount[taskName].Phase[v1.PodPending]++
case v1.PodRunning:
taskStatusCount[taskName].Phase[v1.PodRunning]++
case v1.PodSucceeded:
taskStatusCount[taskName].Phase[v1.PodSucceeded]++
case v1.PodFailed:
taskStatusCount[taskName].Phase[v1.PodFailed]++
default:
taskStatusCount[taskName].Phase[v1.PodUnknown]++
}
}
func isInitiated(job *batch.Job) bool {
if job.Status.State.Phase == "" || job.Status.State.Phase == batch.Pending {
return false
}
return true
}
func newCondition(status batch.JobPhase, lastTransitionTime *metav1.Time) batch.JobCondition {
return batch.JobCondition{
Status: status,
LastTransitionTime: lastTransitionTime,
}
}
/*
Copyright 2017 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"context"
"fmt"
"reflect"
"strconv"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
bus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/apis/pkg/apis/helpers"
scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/apis"
jobcache "volcano.sh/volcano/pkg/controllers/cache"
jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
)
func (cc *jobcontroller) addCommand(obj interface{}) {
cmd, ok := obj.(*bus.Command)
if !ok {
klog.Errorf("obj is not Command")
return
}
cc.commandQueue.Add(cmd)
}
func (cc *jobcontroller) addJob(obj interface{}) {
job, ok := obj.(*batch.Job)
if !ok {
klog.Errorf("obj is not Job")
return
}
req := apis.Request{
Namespace: job.Namespace,
JobName: job.Name,
Event: bus.OutOfSyncEvent,
}
// TODO(k82cn): if adding the job to the cache fails, the cache should be refreshed.
if err := cc.cache.Add(job); err != nil {
klog.Errorf("Failed to add job <%s/%s>: %v in cache",
job.Namespace, job.Name, err)
}
key := jobhelpers.GetJobKeyByReq(&req)
queue := cc.getWorkerQueue(key)
queue.Add(req)
}
func (cc *jobcontroller) updateJob(oldObj, newObj interface{}) {
newJob, ok := newObj.(*batch.Job)
if !ok {
klog.Errorf("newObj is not Job")
return
}
oldJob, ok := oldObj.(*batch.Job)
if !ok {
klog.Errorf("oldJob is not Job")
return
}
// No need to update if ResourceVersion is not changed
if newJob.ResourceVersion == oldJob.ResourceVersion {
klog.V(6).Infof("No need to update because job is not modified.")
return
}
if err := cc.cache.Update(newJob); err != nil {
klog.Errorf("UpdateJob - Failed to update job <%s/%s>: %v in cache",
newJob.Namespace, newJob.Name, err)
}
// NOTE: Since we only reconcile the job based on its Spec, other attributes are ignored.
// The Job status is used internally and is always updated by this controller.
if reflect.DeepEqual(newJob.Spec, oldJob.Spec) && newJob.Status.State.Phase == oldJob.Status.State.Phase {
klog.V(6).Infof("Job update event is ignored since no update in 'Spec'.")
return
}
req := apis.Request{
Namespace: newJob.Namespace,
JobName: newJob.Name,
Event: bus.OutOfSyncEvent,
}
key := jobhelpers.GetJobKeyByReq(&req)
queue := cc.getWorkerQueue(key)
queue.Add(req)
}
func (cc *jobcontroller) deleteJob(obj interface{}) {
job, ok := obj.(*batch.Job)
if !ok {
// If we reached here it means the Job was deleted but its final state is unrecorded.
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
klog.Errorf("Couldn't get object from tombstone %#v", obj)
return
}
job, ok = tombstone.Obj.(*batch.Job)
if !ok {
klog.Errorf("Tombstone contained object that is not a volcano Job: %#v", obj)
return
}
}
if err := cc.cache.Delete(job); err != nil {
klog.Errorf("Failed to delete job <%s/%s>: %v in cache",
job.Namespace, job.Name, err)
}
}
func (cc *jobcontroller) addPod(obj interface{}) {
pod, ok := obj.(*v1.Pod)
if !ok {
klog.Errorf("Failed to convert %v to v1.Pod", obj)
return
}
// Filter out pods that are not created from volcano job
if !isControlledBy(pod, helpers.JobKind) {
return
}
jobName, found := pod.Annotations[batch.JobNameKey]
if !found {
klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
pod.Namespace, pod.Name)
return
}
version, found := pod.Annotations[batch.JobVersion]
if !found {
klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
pod.Namespace, pod.Name)
return
}
dVersion, err := strconv.Atoi(version)
if err != nil {
klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping",
pod.Namespace, pod.Name)
return
}
if pod.DeletionTimestamp != nil {
cc.deletePod(pod)
return
}
req := apis.Request{
Namespace: pod.Namespace,
JobName: jobName,
Event: bus.OutOfSyncEvent,
JobVersion: int32(dVersion),
}
if err := cc.cache.AddPod(pod); err != nil {
klog.Errorf("Failed to add Pod <%s/%s>: %v to cache",
pod.Namespace, pod.Name, err)
}
key := jobhelpers.GetJobKeyByReq(&req)
queue := cc.getWorkerQueue(key)
queue.Add(req)
}
func (cc *jobcontroller) updatePod(oldObj, newObj interface{}) {
oldPod, ok := oldObj.(*v1.Pod)
if !ok {
klog.Errorf("Failed to convert %v to v1.Pod", oldObj)
return
}
newPod, ok := newObj.(*v1.Pod)
if !ok {
klog.Errorf("Failed to convert %v to v1.Pod", newObj)
return
}
// Filter out pods that are not created from volcano job
if !isControlledBy(newPod, helpers.JobKind) {
return
}
if newPod.ResourceVersion == oldPod.ResourceVersion {
return
}
if newPod.DeletionTimestamp != nil {
cc.deletePod(newObj)
return
}
taskName, found := newPod.Annotations[batch.TaskSpecKey]
if !found {
klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping",
newPod.Namespace, newPod.Name)
return
}
jobName, found := newPod.Annotations[batch.JobNameKey]
if !found {
klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
newPod.Namespace, newPod.Name)
return
}
version, found := newPod.Annotations[batch.JobVersion]
if !found {
klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
newPod.Namespace, newPod.Name)
return
}
dVersion, err := strconv.Atoi(version)
if err != nil {
klog.Infof("Failed to convert jobVersion of Pod into number <%s/%s>, skipping",
newPod.Namespace, newPod.Name)
return
}
if err := cc.cache.UpdatePod(newPod); err != nil {
klog.Errorf("Failed to update Pod <%s/%s>: %v in cache",
newPod.Namespace, newPod.Name, err)
}
event := bus.OutOfSyncEvent
var exitCode int32
switch newPod.Status.Phase {
case v1.PodFailed:
if oldPod.Status.Phase != v1.PodFailed {
event = bus.PodFailedEvent
// TODO: currently only single-container pods are supported by volcano.
// Once multi-container pods are supported, update accordingly.
if len(newPod.Status.ContainerStatuses) > 0 && newPod.Status.ContainerStatuses[0].State.Terminated != nil {
exitCode = newPod.Status.ContainerStatuses[0].State.Terminated.ExitCode
}
}
case v1.PodSucceeded:
if oldPod.Status.Phase != v1.PodSucceeded &&
cc.cache.TaskCompleted(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) {
event = bus.TaskCompletedEvent
}
case v1.PodPending, v1.PodRunning:
if cc.cache.TaskFailed(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) {
event = bus.TaskFailedEvent
}
}
req := apis.Request{
Namespace: newPod.Namespace,
JobName: jobName,
TaskName: taskName,
Event: event,
ExitCode: exitCode,
JobVersion: int32(dVersion),
}
key := jobhelpers.GetJobKeyByReq(&req)
queue := cc.getWorkerQueue(key)
queue.Add(req)
}
func (cc *jobcontroller) deletePod(obj interface{}) {
pod, ok := obj.(*v1.Pod)
if !ok {
// If we reached here it means the pod was deleted but its final state is unrecorded.
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
klog.Errorf("Couldn't get object from tombstone %#v", obj)
return
}
pod, ok = tombstone.Obj.(*v1.Pod)
if !ok {
klog.Errorf("Tombstone contained object that is not a Pod: %#v", obj)
return
}
}
// Filter out pods that are not created from volcano job
if !isControlledBy(pod, helpers.JobKind) {
return
}
taskName, found := pod.Annotations[batch.TaskSpecKey]
if !found {
klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping",
pod.Namespace, pod.Name)
return
}
jobName, found := pod.Annotations[batch.JobNameKey]
if !found {
klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
pod.Namespace, pod.Name)
return
}
version, found := pod.Annotations[batch.JobVersion]
if !found {
klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
pod.Namespace, pod.Name)
return
}
dVersion, err := strconv.Atoi(version)
if err != nil {
klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping",
pod.Namespace, pod.Name)
return
}
req := apis.Request{
Namespace: pod.Namespace,
JobName: jobName,
TaskName: taskName,
Event: bus.PodEvictedEvent,
JobVersion: int32(dVersion),
}
if err := cc.cache.DeletePod(pod); err != nil {
klog.Errorf("Failed to delete Pod <%s/%s>: %v in cache",
pod.Namespace, pod.Name, err)
}
key := jobhelpers.GetJobKeyByReq(&req)
queue := cc.getWorkerQueue(key)
queue.Add(req)
}
func (cc *jobcontroller) recordJobEvent(namespace, name string, event batch.JobEvent, message string) {
job, err := cc.cache.Get(jobcache.JobKeyByName(namespace, name))
if err != nil {
klog.Warningf("Failed to find job in cache when reporting job event <%s/%s>: %v",
namespace, name, err)
return
}
cc.recorder.Event(job.Job, v1.EventTypeNormal, string(event), message)
}
func (cc *jobcontroller) handleCommands() {
for cc.processNextCommand() {
}
}
func (cc *jobcontroller) processNextCommand() bool {
obj, shutdown := cc.commandQueue.Get()
if shutdown {
return false
}
cmd := obj.(*bus.Command)
defer cc.commandQueue.Done(cmd)
if err := cc.vcClient.BusV1alpha1().Commands(cmd.Namespace).Delete(context.TODO(), cmd.Name, metav1.DeleteOptions{}); err != nil {
if !apierrors.IsNotFound(err) {
klog.Errorf("Failed to delete Command <%s/%s>.", cmd.Namespace, cmd.Name)
cc.commandQueue.AddRateLimited(cmd)
}
return true
}
cc.recordJobEvent(cmd.Namespace, cmd.TargetObject.Name,
batch.CommandIssued,
fmt.Sprintf(
"Start to execute command %s, and clean it up to make sure executed not more than once.", cmd.Action))
req := apis.Request{
Namespace: cmd.Namespace,
JobName: cmd.TargetObject.Name,
Event: bus.CommandIssuedEvent,
Action: bus.Action(cmd.Action),
}
key := jobhelpers.GetJobKeyByReq(&req)
queue := cc.getWorkerQueue(key)
queue.Add(req)
return true
}
func (cc *jobcontroller) updatePodGroup(oldObj, newObj interface{}) {
oldPG, ok := oldObj.(*scheduling.PodGroup)
if !ok {
klog.Errorf("Failed to convert %v to PodGroup", newObj)
return
}
newPG, ok := newObj.(*scheduling.PodGroup)
if !ok {
klog.Errorf("Failed to convert %v to PodGroup", newObj)
return
}
_, err := cc.cache.Get(jobcache.JobKeyByName(newPG.Namespace, newPG.Name))
if err != nil && newPG.Annotations != nil {
klog.Warningf(
"Failed to find job in cache by PodGroup, this may not be a PodGroup for volcano job.")
}
if newPG.Status.Phase != oldPG.Status.Phase {
req := apis.Request{
Namespace: newPG.Namespace,
JobName: newPG.Name,
}
switch newPG.Status.Phase {
case scheduling.PodGroupUnknown:
req.Event = bus.JobUnknownEvent
}
key := jobhelpers.GetJobKeyByReq(&req)
queue := cc.getWorkerQueue(key)
queue.Add(req)
}
}
// TODO(k82cn): add handler for PodGroup unschedulable event.
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/volcano/pkg/controllers/job/plugins"
pluginsinterface "volcano.sh/volcano/pkg/controllers/job/plugins/interface"
)
func (cc *jobcontroller) pluginOnPodCreate(job *batch.Job, pod *v1.Pod) error {
client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
for name, args := range job.Spec.Plugins {
pb, found := plugins.GetPluginBuilder(name)
if !found {
err := fmt.Errorf("failed to get plugin %s", name)
klog.Error(err)
return err
}
klog.Infof("Starting to execute plugin at <pluginOnPodCreate>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
if err := pb(client, args).OnPodCreate(pod, job); err != nil {
klog.Errorf("Failed to process on pod create plugin %s, err %v.", name, err)
return err
}
}
return nil
}
func (cc *jobcontroller) pluginOnJobAdd(job *batch.Job) error {
client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
if job.Status.ControlledResources == nil {
job.Status.ControlledResources = make(map[string]string)
}
for name, args := range job.Spec.Plugins {
pb, found := plugins.GetPluginBuilder(name)
if !found {
err := fmt.Errorf("failed to get plugin %s", name)
klog.Error(err)
return err
}
klog.Infof("Starting to execute plugin at <pluginOnJobAdd>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
if err := pb(client, args).OnJobAdd(job); err != nil {
klog.Errorf("Failed to process on job add plugin %s, err %v.", name, err)
return err
}
}
return nil
}
func (cc *jobcontroller) pluginOnJobDelete(job *batch.Job) error {
if job.Status.ControlledResources == nil {
job.Status.ControlledResources = make(map[string]string)
}
client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
for name, args := range job.Spec.Plugins {
pb, found := plugins.GetPluginBuilder(name)
if !found {
err := fmt.Errorf("failed to get plugin %s", name)
klog.Error(err)
return err
}
klog.Infof("Starting to execute plugin at <pluginOnJobDelete>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
if err := pb(client, args).OnJobDelete(job); err != nil {
klog.Errorf("failed to process on job delete plugin %s, err %v.", name, err)
return err
}
}
return nil
}
func (cc *jobcontroller) pluginOnJobUpdate(job *batch.Job) error {
client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
if job.Status.ControlledResources == nil {
job.Status.ControlledResources = make(map[string]string)
}
for name, args := range job.Spec.Plugins {
pb, found := plugins.GetPluginBuilder(name)
if !found {
err := fmt.Errorf("failed to get plugin %s", name)
klog.Error(err)
return err
}
klog.Infof("Starting to execute plugin at <pluginOnJobUpdate>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
if err := pb(client, args).OnJobUpdate(job); err != nil {
klog.Errorf("Failed to process on job update plugin %s, err %v.", name, err)
return err
}
}
return nil
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"context"
"fmt"
"time"
"golang.org/x/time/rate"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
)
func newRateLimitingQueue() workqueue.RateLimitingInterface {
return workqueue.NewRateLimitingQueue(workqueue.NewMaxOfRateLimiter(
workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second),
// 10 qps, 100 bucket size. This is only for retry speed and it is only the overall factor (not per item).
&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
))
}
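// Illustrative sketch (assumption, not part of the original source): with the
// combined limiter above, a single failing task is retried with per-item
// exponential backoff of roughly 5ms, 10ms, 20ms, ... capped at 180s, while the
// bucket limiter additionally caps the overall requeue rate at about 10 per
// second with a burst of 100. A rough usage sketch:
//
//	q := newRateLimitingQueue()
//	q.AddRateLimited(task) // delay grows with q.NumRequeues(task)
//	q.Forget(task)         // resets the per-item backoff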
func (cc *jobcontroller) processResyncTask() {
obj, shutdown := cc.errTasks.Get()
if shutdown {
return
}
// a task is resynced at most 10 times
if cc.errTasks.NumRequeues(obj) > 10 {
cc.errTasks.Forget(obj)
return
}
defer cc.errTasks.Done(obj)
task, ok := obj.(*v1.Pod)
if !ok {
klog.Errorf("failed to convert %v to *v1.Pod", obj)
return
}
if err := cc.syncTask(task); err != nil {
klog.Errorf("Failed to sync pod <%v/%v>, retry it, err %v", task.Namespace, task.Name, err)
cc.resyncTask(task)
}
}
func (cc *jobcontroller) syncTask(oldTask *v1.Pod) error {
newPod, err := cc.kubeClient.CoreV1().Pods(oldTask.Namespace).Get(context.TODO(), oldTask.Name, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
if err := cc.cache.DeletePod(oldTask); err != nil {
klog.Errorf("failed to delete cache pod <%v/%v>, err %v.", oldTask.Namespace, oldTask.Name, err)
return err
}
klog.V(3).Infof("Pod <%v/%v> was deleted, removed from cache.", oldTask.Namespace, oldTask.Name)
return nil
}
return fmt.Errorf("failed to get Pod <%v/%v>: err %v", oldTask.Namespace, oldTask.Name, err)
}
return cc.cache.UpdatePod(newPod)
}
func (cc *jobcontroller) resyncTask(task *v1.Pod) {
cc.errTasks.AddRateLimited(task)
}
/*
Copyright 2017 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
"time"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/klog"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/apis/pkg/apis/helpers"
schedulingv2 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/apis"
jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
)
var detectionPeriodOfDependsOntask time.Duration
// MakePodName builds the pod name from the job name, task name and index.
func MakePodName(jobName string, taskName string, index int) string {
return fmt.Sprintf(jobhelpers.PodNameFmt, jobName, taskName, index)
}
func createJobPod(job *batch.Job, template *v1.PodTemplateSpec, topologyPolicy batch.NumaPolicy, ix int, jobForwarding bool) *v1.Pod {
templateCopy := template.DeepCopy()
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: jobhelpers.MakePodName(job.Name, template.Name, ix),
Namespace: job.Namespace,
OwnerReferences: []metav1.OwnerReference{
*metav1.NewControllerRef(job, helpers.JobKind),
},
Labels: templateCopy.Labels,
Annotations: templateCopy.Annotations,
},
Spec: templateCopy.Spec,
}
// If no scheduler name in Pod, use scheduler name from Job.
if len(pod.Spec.SchedulerName) == 0 {
pod.Spec.SchedulerName = job.Spec.SchedulerName
}
volumeMap := make(map[string]string)
for _, volume := range job.Spec.Volumes {
vcName := volume.VolumeClaimName
name := fmt.Sprintf("%s-%s", job.Name, jobhelpers.GenRandomStr(12))
if _, ok := volumeMap[vcName]; !ok {
volume := v1.Volume{
Name: name,
VolumeSource: v1.VolumeSource{
PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
ClaimName: vcName,
},
},
}
pod.Spec.Volumes = append(pod.Spec.Volumes, volume)
volumeMap[vcName] = name
} else {
// duplicate volume claim names are skipped
continue
}
for i, c := range pod.Spec.Containers {
vm := v1.VolumeMount{
MountPath: volume.MountPath,
Name: name,
}
pod.Spec.Containers[i].VolumeMounts = append(c.VolumeMounts, vm)
}
}
tsKey := templateCopy.Name
if len(tsKey) == 0 {
tsKey = batch.DefaultTaskSpec
}
if len(pod.Annotations) == 0 {
pod.Annotations = make(map[string]string)
}
pod.Annotations[batch.TaskSpecKey] = tsKey
pod.Annotations[schedulingv2.KubeGroupNameAnnotationKey] = job.Name
pod.Annotations[batch.JobNameKey] = job.Name
pod.Annotations[batch.QueueNameKey] = job.Spec.Queue
pod.Annotations[batch.JobVersion] = fmt.Sprintf("%d", job.Status.Version)
pod.Annotations[batch.PodTemplateKey] = fmt.Sprintf("%s-%s", job.Name, template.Name)
if topologyPolicy != "" {
pod.Annotations[schedulingv2.NumaPolicyKey] = string(topologyPolicy)
}
if len(job.Annotations) > 0 {
if value, found := job.Annotations[schedulingv2.PodPreemptable]; found {
pod.Annotations[schedulingv2.PodPreemptable] = value
}
if value, found := job.Annotations[schedulingv2.RevocableZone]; found {
pod.Annotations[schedulingv2.RevocableZone] = value
}
if value, found := job.Annotations[schedulingv2.JDBMinAvailable]; found {
pod.Annotations[schedulingv2.JDBMinAvailable] = value
} else if value, found := job.Annotations[schedulingv2.JDBMaxUnavailable]; found {
pod.Annotations[schedulingv2.JDBMaxUnavailable] = value
}
}
if len(pod.Labels) == 0 {
pod.Labels = make(map[string]string)
}
// Set pod labels for Service.
pod.Labels[batch.JobNameKey] = job.Name
pod.Labels[batch.TaskSpecKey] = tsKey
pod.Labels[batch.JobNamespaceKey] = job.Namespace
pod.Labels[batch.QueueNameKey] = job.Spec.Queue
if len(job.Labels) > 0 {
if value, found := job.Labels[schedulingv2.PodPreemptable]; found {
pod.Labels[schedulingv2.PodPreemptable] = value
}
}
if jobForwarding {
pod.Annotations[batch.JobForwardingKey] = "true"
pod.Labels[batch.JobForwardingKey] = "true"
}
return pod
}
func applyPolicies(job *batch.Job, req *apis.Request) v1alpha1.Action {
if len(req.Action) != 0 {
return req.Action
}
if req.Event == v1alpha1.OutOfSyncEvent {
return v1alpha1.SyncJobAction
}
// All requests triggered from discarded (outdated) job resources perform the sync action instead.
if req.JobVersion < job.Status.Version {
klog.Infof("Request %s is outdated, will perform sync instead.", req)
return v1alpha1.SyncJobAction
}
// Overwrite Job level policies
if len(req.TaskName) != 0 {
// Parse task level policies
for _, task := range job.Spec.Tasks {
if task.Name == req.TaskName {
for _, policy := range task.Policies {
policyEvents := getEventlist(policy)
if len(policyEvents) > 0 && len(req.Event) > 0 {
if checkEventExist(policyEvents, req.Event) || checkEventExist(policyEvents, v1alpha1.AnyEvent) {
return policy.Action
}
}
// Exit code 0 is not an error code; it is rejected by the validating admission controller.
if policy.ExitCode != nil && *policy.ExitCode == req.ExitCode {
return policy.Action
}
}
break
}
}
}
// Parse Job level policies
for _, policy := range job.Spec.Policies {
policyEvents := getEventlist(policy)
if len(policyEvents) > 0 && len(req.Event) > 0 {
if checkEventExist(policyEvents, req.Event) || checkEventExist(policyEvents, v1alpha1.AnyEvent) {
return policy.Action
}
}
// Exit code 0 is not an error code; it is rejected by the validating admission controller.
if policy.ExitCode != nil && *policy.ExitCode == req.ExitCode {
return policy.Action
}
}
return v1alpha1.SyncJobAction
}
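// Illustrative sketch (assumption, not part of the original source): for a
// request carrying Event=PodFailedEvent and TaskName="worker", applyPolicies
// first checks the policies declared on the "worker" task (for example a policy
// with event PodFailed and action RestartJob), then falls back to the job-level
// policies, and finally to SyncJob when nothing matches. Requests that already
// carry an action return it directly, OutOfSync events always map to SyncJob,
// and a policy with a non-nil ExitCode matches on the request's exit code
// instead of its event.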
func getEventlist(policy batch.LifecyclePolicy) []v1alpha1.Event {
policyEventsList := policy.Events
if len(policy.Event) > 0 {
policyEventsList = append(policyEventsList, policy.Event)
}
return policyEventsList
}
func checkEventExist(policyEvents []v1alpha1.Event, reqEvent v1alpha1.Event) bool {
for _, event := range policyEvents {
if event == reqEvent {
return true
}
}
return false
}
func addResourceList(list, req, limit v1.ResourceList) {
for name, quantity := range req {
if value, ok := list[name]; !ok {
list[name] = quantity.DeepCopy()
} else {
value.Add(quantity)
list[name] = value
}
}
if req != nil {
return
}
// If Requests is omitted for a container,
// it defaults to Limits if that is explicitly specified.
for name, quantity := range limit {
if value, ok := list[name]; !ok {
list[name] = quantity.DeepCopy()
} else {
value.Add(quantity)
list[name] = value
}
}
}
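// Illustrative sketch (assumption, not part of the original source):
// addResourceList accumulates per-resource quantities into list, preferring
// Requests and falling back to Limits only when Requests is nil. For example,
// two containers requesting cpu=500m and cpu=1 accumulate to cpu=1500m, while a
// container that sets only Limits contributes those Limits instead.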
// TaskPriority is a TaskSpec with its resolved priority.
type TaskPriority struct {
priority int32
batch.TaskSpec
}
// TasksPriority is a slice of TaskPriority.
type TasksPriority []TaskPriority
func (p TasksPriority) Len() int { return len(p) }
func (p TasksPriority) Less(i, j int) bool {
return p[i].priority > p[j].priority
}
func (p TasksPriority) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
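// Illustrative sketch (assumption, not part of the original source): because
// Less compares with ">", sort.Sort orders tasks by descending priority, e.g.
//
//	tp := TasksPriority{{priority: 1}, {priority: 100}, {priority: 10}}
//	sort.Sort(tp) // priorities are now 100, 10, 1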
func isControlledBy(obj metav1.Object, gvk schema.GroupVersionKind) bool {
controllerRef := metav1.GetControllerOf(obj)
if controllerRef == nil {
return false
}
if controllerRef.APIVersion == gvk.GroupVersion().String() && controllerRef.Kind == gvk.Kind {
return true
}
return false
}
// SetDetectionPeriodOfDependsOntask sets the detection period for the depends-on-task feature.
func SetDetectionPeriodOfDependsOntask(period time.Duration) {
detectionPeriodOfDependsOntask = period
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tensorflow
import (
"encoding/json"
"flag"
"fmt"
"strconv"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
pluginsinterface "volcano.sh/volcano/pkg/controllers/job/plugins/interface"
)
const (
DefaultPort = 2222
TFConfig = "TF_CONFIG"
)
type tensorflowPlugin struct {
tfArguments []string
Clientset pluginsinterface.PluginClientset
psName string
workerName string
chiefName string
evaluatorName string
port int
}
// New creates tensorflow plugin.
func New(client pluginsinterface.PluginClientset, arguments []string) pluginsinterface.PluginInterface {
tp := tensorflowPlugin{tfArguments: arguments, Clientset: client}
tp.addFlags()
return &tp
}
func (tp *tensorflowPlugin) addFlags() {
flagSet := flag.NewFlagSet(tp.Name(), flag.ContinueOnError)
flagSet.StringVar(&tp.psName, "ps", "ps", "name of ps role task")
flagSet.StringVar(&tp.workerName, "worker", "worker", "name of ps role task")
flagSet.StringVar(&tp.chiefName, "chief", "chief", "name of chief role task")
flagSet.StringVar(&tp.evaluatorName, "evaluator", "evaluator", "name of evaluator role task")
flagSet.IntVar(&tp.port, "port", DefaultPort, "service port")
if err := flagSet.Parse(tp.tfArguments); err != nil {
klog.Errorf("plugin %s flagset parse failed, err: %v", tp.Name(), err)
}
}
func (tp *tensorflowPlugin) Name() string {
return "tensorflow"
}
func (tp *tensorflowPlugin) OnPodCreate(pod *v1.Pod, job *batch.Job) error {
// No need to generate TF_CONFIG for stand-alone tensorflow job
if len(job.Spec.Tasks) == 1 && job.Spec.Tasks[0].Replicas == 1 {
return nil
}
// Generate TF_CONFIG value
spec, err := tp.generateTFClusterSpec(pod, job)
if err != nil {
return err
}
raw, err := json.Marshal(spec)
if err != nil {
return err
}
// Add TF_CONFIG environment variables
for i := range pod.Spec.Containers {
pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, v1.EnvVar{
Name: TFConfig,
Value: string(raw),
})
}
return nil
}
func (tp *tensorflowPlugin) OnJobAdd(job *batch.Job) error {
if job.Status.ControlledResources["plugin-"+tp.Name()] == tp.Name() {
return nil
}
job.Status.ControlledResources["plugin-"+tp.Name()] = tp.Name()
return nil
}
func (tp *tensorflowPlugin) OnJobDelete(job *batch.Job) error {
if job.Status.ControlledResources["plugin-"+tp.Name()] != tp.Name() {
return nil
}
delete(job.Status.ControlledResources, "plugin-"+tp.Name())
return nil
}
func (tp *tensorflowPlugin) OnJobUpdate(job *batch.Job) error {
return nil
}
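// Illustrative sketch (assumption, not part of the original source): the plugin
// is enabled through spec.plugins on the Job, and its arguments are parsed by
// addFlags above. A hypothetical manifest fragment:
//
//	spec:
//	  plugins:
//	    tensorflow: ["--port=2222", "--ps=ps", "--worker=worker"]
//
// The flag names correspond to the flagSet definitions (--ps, --worker, --chief,
// --evaluator, --port).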
func (tp *tensorflowPlugin) generateTFClusterSpec(pod *v1.Pod, job *batch.Job) (tfClusterSpec, error) {
index, err := strconv.Atoi(jobhelpers.GetPodIndexUnderTask(pod))
if err != nil {
return tfClusterSpec{}, err
}
// Generate tensorflow task info
c := tfClusterSpec{
Task: taskInfo{
Type: tp.getTaskType(jobhelpers.GetTaskKey(pod)),
Index: index,
},
}
// Generate tensorflow cluster info
for _, ts := range job.Spec.Tasks {
hosts := []string{}
for i := 0; i < int(ts.Replicas); i++ {
hosts = append(hosts, fmt.Sprintf("%s:%d", jobhelpers.MakeDomainName(ts, job, i), tp.port))
}
switch ts.Name {
case tp.psName:
c.Cluster.PS = hosts
case tp.workerName:
c.Cluster.Worker = hosts
case tp.chiefName:
c.Cluster.Chief = hosts
case tp.evaluatorName:
c.Cluster.Evaluator = hosts
}
}
return c, nil
}
func (tp *tensorflowPlugin) getTaskType(taskKey string) tfTaskType {
switch taskKey {
case tp.chiefName:
return tfChief
case tp.workerName:
return tfWorker
case tp.psName:
return tfPS
case tp.evaluatorName:
return tfEvaluator
}
return tfTaskType(taskKey)
}
// tfClusterSpec is the spec of a tensorflow cluster.
// It will be injected into container's environment variables, and be used by tensorflow framework.
// e.g.
// {
// "cluster": {
// "worker": ["worker-0:2222", "worker-1:2222"],
// "ps": ["ps-0:2222"]
// },
// "task": {
// "type": "worker",
// "index": 0
// }
// }
type tfClusterSpec struct {
Cluster clusterInfo `json:"cluster"`
Task taskInfo `json:"task"`
}
type clusterInfo struct {
PS []string `json:"ps,omitempty"`
Worker []string `json:"worker,omitempty"`
Chief []string `json:"chief,omitempty"`
Evaluator []string `json:"evaluator,omitempty"`
}
type tfTaskType string
const (
tfWorker tfTaskType = "worker"
tfChief tfTaskType = "chief"
tfPS tfTaskType = "ps"
tfEvaluator tfTaskType = "evaluator"
)
type taskInfo struct {
Type tfTaskType `json:"type"`
Index int `json:"index"`
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ssh
import (
"crypto/rand"
"crypto/rsa"
"crypto/x509"
"encoding/pem"
"flag"
"fmt"
"golang.org/x/crypto/ssh"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/apis/pkg/apis/helpers"
jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
pluginsinterface "volcano.sh/volcano/pkg/controllers/job/plugins/interface"
)
type sshPlugin struct {
// Arguments given for the plugin
pluginArguments []string
client pluginsinterface.PluginClientset
// flag parse args
sshKeyFilePath string
// private key string
sshPrivateKey string
// public key string
sshPublicKey string
}
// New creates ssh plugin
func New(client pluginsinterface.PluginClientset, arguments []string) pluginsinterface.PluginInterface {
p := sshPlugin{
pluginArguments: arguments,
client: client,
sshKeyFilePath: SSHAbsolutePath,
}
p.addFlags()
return &p
}
func (sp *sshPlugin) Name() string {
return "ssh"
}
func (sp *sshPlugin) OnPodCreate(pod *v1.Pod, job *batch.Job) error {
sp.mountRsaKey(pod, job)
return nil
}
func (sp *sshPlugin) OnJobAdd(job *batch.Job) error {
if job.Status.ControlledResources["plugin-"+sp.Name()] == sp.Name() {
return nil
}
var data map[string][]byte
var err error
if len(sp.sshPrivateKey) > 0 {
data, err = withUserProvidedRsaKey(job, sp.sshPrivateKey, sp.sshPublicKey)
} else {
data, err = generateRsaKey(job)
}
if err != nil {
return err
}
if err := helpers.CreateOrUpdateSecret(job, sp.client.KubeClients, data, sp.secretName(job)); err != nil {
return fmt.Errorf("create secret for job <%s/%s> with ssh plugin failed for %v",
job.Namespace, job.Name, err)
}
job.Status.ControlledResources["plugin-"+sp.Name()] = sp.Name()
return nil
}
func (sp *sshPlugin) OnJobDelete(job *batch.Job) error {
if job.Status.ControlledResources["plugin-"+sp.Name()] != sp.Name() {
return nil
}
if err := helpers.DeleteSecret(job, sp.client.KubeClients, sp.secretName(job)); err != nil {
return err
}
delete(job.Status.ControlledResources, "plugin-"+sp.Name())
return nil
}
// TODO: currently a container using a Secret as a subPath volume mount will not receive Secret updates.
// we may not update the job secret due to the above reason now.
// related issue: https://github.com/volcano-sh/volcano/issues/1420
func (sp *sshPlugin) OnJobUpdate(job *batch.Job) error {
//data, err := generateRsaKey(job)
//if err != nil {
// return err
//}
//
//if err := helpers.CreateOrUpdateSecret(job, sp.client.KubeClients, data, sp.secretName(job)); err != nil {
// return fmt.Errorf("update secret for job <%s/%s> with ssh plugin failed for %v",
// job.Namespace, job.Name, err)
//}
return nil
}
func (sp *sshPlugin) mountRsaKey(pod *v1.Pod, job *batch.Job) {
secretName := sp.secretName(job)
sshVolume := v1.Volume{
Name: secretName,
}
var mode int32 = 0600
sshVolume.Secret = &v1.SecretVolumeSource{
SecretName: secretName,
Items: []v1.KeyToPath{
{
Key: SSHPrivateKey,
Path: SSHRelativePath + "/" + SSHPrivateKey,
},
{
Key: SSHPublicKey,
Path: SSHRelativePath + "/" + SSHPublicKey,
},
{
Key: SSHAuthorizedKeys,
Path: SSHRelativePath + "/" + SSHAuthorizedKeys,
},
{
Key: SSHConfig,
Path: SSHRelativePath + "/" + SSHConfig,
},
},
DefaultMode: &mode,
}
if sp.sshKeyFilePath != SSHAbsolutePath {
var noRootMode int32 = 0600
sshVolume.Secret.DefaultMode = &noRootMode
}
pod.Spec.Volumes = append(pod.Spec.Volumes, sshVolume)
for i, c := range pod.Spec.Containers {
vm := v1.VolumeMount{
MountPath: sp.sshKeyFilePath,
SubPath: SSHRelativePath,
Name: secretName,
}
pod.Spec.Containers[i].VolumeMounts = append(c.VolumeMounts, vm)
}
for i, c := range pod.Spec.InitContainers {
vm := v1.VolumeMount{
MountPath: sp.sshKeyFilePath,
SubPath: SSHRelativePath,
Name: secretName,
}
pod.Spec.InitContainers[i].VolumeMounts = append(c.VolumeMounts, vm)
}
}
func generateRsaKey(job *batch.Job) (map[string][]byte, error) {
bitSize := 2048
privateKey, err := rsa.GenerateKey(rand.Reader, bitSize)
if err != nil {
klog.Errorf("rsa generateKey err: %v", err)
return nil, err
}
// id_rsa
privBlock := pem.Block{
Type: "RSA PRIVATE KEY",
Bytes: x509.MarshalPKCS1PrivateKey(privateKey),
}
privateKeyBytes := pem.EncodeToMemory(&privBlock)
// id_rsa.pub
publicRsaKey, err := ssh.NewPublicKey(&privateKey.PublicKey)
if err != nil {
klog.Errorf("ssh newPublicKey err: %v", err)
return nil, err
}
publicKeyBytes := ssh.MarshalAuthorizedKey(publicRsaKey)
data := make(map[string][]byte)
data[SSHPrivateKey] = privateKeyBytes
data[SSHPublicKey] = publicKeyBytes
data[SSHAuthorizedKeys] = publicKeyBytes
data[SSHConfig] = []byte(generateSSHConfig(job))
return data, nil
}
func withUserProvidedRsaKey(job *batch.Job, sshPrivateKey string, sshPublicKey string) (map[string][]byte, error) {
data := make(map[string][]byte)
data[SSHPrivateKey] = []byte(sshPrivateKey)
data[SSHPublicKey] = []byte(sshPublicKey)
data[SSHAuthorizedKeys] = []byte(sshPublicKey)
data[SSHConfig] = []byte(generateSSHConfig(job))
return data, nil
}
func (sp *sshPlugin) secretName(job *batch.Job) string {
return fmt.Sprintf("%s-%s", job.Name, sp.Name())
}
func (sp *sshPlugin) addFlags() {
flagSet := flag.NewFlagSet(sp.Name(), flag.ContinueOnError)
flagSet.StringVar(&sp.sshKeyFilePath, "ssh-key-file-path", sp.sshKeyFilePath, "The path used to store "+
"ssh private and public keys, it is `/root/.ssh` by default.")
flagSet.StringVar(&sp.sshPrivateKey, "ssh-private-key", sp.sshPrivateKey, "The input string of the private key")
flagSet.StringVar(&sp.sshPublicKey, "ssh-public-key", sp.sshPublicKey, "The input string of the public key")
if err := flagSet.Parse(sp.pluginArguments); err != nil {
klog.Errorf("plugin %s flagset parse failed, err: %v", sp.Name(), err)
}
}
func generateSSHConfig(job *batch.Job) string {
config := "StrictHostKeyChecking no\nUserKnownHostsFile /dev/null\n"
for _, ts := range job.Spec.Tasks {
for i := 0; i < int(ts.Replicas); i++ {
hostName := ts.Template.Spec.Hostname
subdomain := ts.Template.Spec.Subdomain
if len(hostName) == 0 {
hostName = jobhelpers.MakePodName(job.Name, ts.Name, i)
}
if len(subdomain) == 0 {
subdomain = job.Name
}
config += "Host " + hostName + "\n"
config += " HostName " + hostName + "." + subdomain + "\n"
if len(ts.Template.Spec.Hostname) != 0 {
break
}
}
}
return config
}
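// Illustrative sketch (assumption, not part of the original source): for a
// hypothetical job "mpi-job" with a task "worker" of 2 replicas and no explicit
// hostname, and assuming jobhelpers.MakePodName produces names of the form
// <job>-<task>-<index>, generateSSHConfig returns roughly:
//
//	StrictHostKeyChecking no
//	UserKnownHostsFile /dev/null
//	Host mpi-job-worker-0
//	 HostName mpi-job-worker-0.mpi-job
//	Host mpi-job-worker-1
//	 HostName mpi-job-worker-1.mpi-job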
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podgroup
import (
"k8s.io/apimachinery/pkg/util/wait"
coreinformers "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/kubernetes"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
schedulinginformer "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
schedulinglister "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/framework"
)
func init() {
framework.RegisterController(&pgcontroller{})
}
// pgcontroller is the controller type for PodGroups.
type pgcontroller struct {
kubeClient kubernetes.Interface
vcClient vcclientset.Interface
podInformer coreinformers.PodInformer
pgInformer schedulinginformer.PodGroupInformer
// A store of pods
podLister corelisters.PodLister
podSynced func() bool
// A store of podgroups
pgLister schedulinglister.PodGroupLister
pgSynced func() bool
queue workqueue.RateLimitingInterface
schedulerNames []string
}
func (pg *pgcontroller) Name() string {
return "pg-controller"
}
// Initialize creates a new Podgroup controller.
func (pg *pgcontroller) Initialize(opt *framework.ControllerOption) error {
pg.kubeClient = opt.KubeClient
pg.vcClient = opt.VolcanoClient
pg.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
pg.schedulerNames = make([]string, len(opt.SchedulerNames))
copy(pg.schedulerNames, opt.SchedulerNames)
pg.podInformer = opt.SharedInformerFactory.Core().V1().Pods()
pg.podLister = pg.podInformer.Lister()
pg.podSynced = pg.podInformer.Informer().HasSynced
pg.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: pg.addPod,
})
pg.pgInformer = informerfactory.NewSharedInformerFactory(pg.vcClient, 0).Scheduling().V1beta1().PodGroups()
pg.pgLister = pg.pgInformer.Lister()
pg.pgSynced = pg.pgInformer.Informer().HasSynced
return nil
}
// Run starts the Podgroup controller.
func (pg *pgcontroller) Run(stopCh <-chan struct{}) {
go pg.podInformer.Informer().Run(stopCh)
go pg.pgInformer.Informer().Run(stopCh)
cache.WaitForCacheSync(stopCh, pg.podSynced, pg.pgSynced)
go wait.Until(pg.worker, 0, stopCh)
klog.Infof("PodgroupController is running ...... ")
}
func (pg *pgcontroller) worker() {
for pg.processNextReq() {
}
}
func (pg *pgcontroller) processNextReq() bool {
obj, shutdown := pg.queue.Get()
if shutdown {
klog.Errorf("Fail to pop item from queue")
return false
}
req := obj.(podRequest)
defer pg.queue.Done(req)
pod, err := pg.podLister.Pods(req.podNamespace).Get(req.podName)
if err != nil {
klog.Errorf("Failed to get pod by <%v> from cache: %v", req, err)
return true
}
if !contains(pg.schedulerNames, pod.Spec.SchedulerName) {
klog.V(5).Infof("pod %v/%v field SchedulerName is not matched", pod.Namespace, pod.Name)
return true
}
if pod.Annotations != nil && pod.Annotations[scheduling.KubeGroupNameAnnotationKey] != "" {
klog.V(5).Infof("pod %v/%v has created podgroup", pod.Namespace, pod.Name)
return true
}
// normal pod that uses the volcano scheduler
if err := pg.createNormalPodPGIfNotExist(pod); err != nil {
klog.Errorf("Failed to handle Pod <%s/%s>: %v", pod.Namespace, pod.Name, err)
pg.queue.AddRateLimited(req)
return true
}
// If no error, forget it.
pg.queue.Forget(req)
return true
}
func contains(slice []string, element string) bool {
for _, item := range slice {
if item == element {
return true
}
}
return false
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podgroup
import (
"context"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/helpers"
scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
type podRequest struct {
podName string
podNamespace string
}
func (pg *pgcontroller) addPod(obj interface{}) {
pod, ok := obj.(*v1.Pod)
if !ok {
klog.Errorf("Failed to convert %v to v1.Pod", obj)
return
}
req := podRequest{
podName: pod.Name,
podNamespace: pod.Namespace,
}
pg.queue.Add(req)
}
func (pg *pgcontroller) updatePodAnnotations(pod *v1.Pod, pgName string) error {
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
}
if pod.Annotations[scheduling.KubeGroupNameAnnotationKey] == "" {
pod.Annotations[scheduling.KubeGroupNameAnnotationKey] = pgName
} else {
if pod.Annotations[scheduling.KubeGroupNameAnnotationKey] != pgName {
klog.Errorf("normal pod %s/%s annotations %s value is not %s, but %s", pod.Namespace, pod.Name,
scheduling.KubeGroupNameAnnotationKey, pgName, pod.Annotations[scheduling.KubeGroupNameAnnotationKey])
}
return nil
}
if _, err := pg.kubeClient.CoreV1().Pods(pod.Namespace).Update(context.TODO(), pod, metav1.UpdateOptions{}); err != nil {
klog.Errorf("Failed to update pod <%s/%s>: %v", pod.Namespace, pod.Name, err)
return err
}
return nil
}
func (pg *pgcontroller) createNormalPodPGIfNotExist(pod *v1.Pod) error {
pgName := helpers.GeneratePodgroupName(pod)
if _, err := pg.pgLister.PodGroups(pod.Namespace).Get(pgName); err != nil {
if !apierrors.IsNotFound(err) {
klog.Errorf("Failed to get normal PodGroup for Pod <%s/%s>: %v",
pod.Namespace, pod.Name, err)
return err
}
obj := &scheduling.PodGroup{
ObjectMeta: metav1.ObjectMeta{
Namespace: pod.Namespace,
Name: pgName,
OwnerReferences: newPGOwnerReferences(pod),
Annotations: map[string]string{},
Labels: map[string]string{},
},
Spec: scheduling.PodGroupSpec{
MinMember: 1,
PriorityClassName: pod.Spec.PriorityClassName,
MinResources: calcPGMinResources(pod),
},
}
if queueName, ok := pod.Annotations[scheduling.QueueNameAnnotationKey]; ok {
obj.Spec.Queue = queueName
}
if value, ok := pod.Annotations[scheduling.PodPreemptable]; ok {
obj.Annotations[scheduling.PodPreemptable] = value
}
if value, ok := pod.Annotations[scheduling.RevocableZone]; ok {
obj.Annotations[scheduling.RevocableZone] = value
}
if value, ok := pod.Labels[scheduling.PodPreemptable]; ok {
obj.Labels[scheduling.PodPreemptable] = value
}
if value, found := pod.Annotations[scheduling.JDBMinAvailable]; found {
obj.Annotations[scheduling.JDBMinAvailable] = value
} else if value, found := pod.Annotations[scheduling.JDBMaxUnavailable]; found {
obj.Annotations[scheduling.JDBMaxUnavailable] = value
}
if _, err := pg.vcClient.SchedulingV1beta1().PodGroups(pod.Namespace).Create(context.TODO(), obj, metav1.CreateOptions{}); err != nil {
klog.Errorf("Failed to create normal PodGroup for Pod <%s/%s>: %v",
pod.Namespace, pod.Name, err)
return err
}
}
return pg.updatePodAnnotations(pod, pgName)
}
func newPGOwnerReferences(pod *v1.Pod) []metav1.OwnerReference {
if len(pod.OwnerReferences) != 0 {
for _, ownerReference := range pod.OwnerReferences {
if ownerReference.Controller != nil && *ownerReference.Controller {
return pod.OwnerReferences
}
}
}
gvk := schema.GroupVersionKind{
Group: v1.SchemeGroupVersion.Group,
Version: v1.SchemeGroupVersion.Version,
Kind: "Pod",
}
ref := metav1.NewControllerRef(pod, gvk)
return []metav1.OwnerReference{*ref}
}
// addResourceList adds the request (or limit) quantities of each resource to list.
func addResourceList(list, req, limit v1.ResourceList) {
for name, quantity := range req {
if value, ok := list[name]; !ok {
list[name] = quantity.DeepCopy()
} else {
value.Add(quantity)
list[name] = value
}
}
if req != nil {
return
}
// If Requests is omitted for a container,
// it defaults to Limits if that is explicitly specified.
for name, quantity := range limit {
if value, ok := list[name]; !ok {
list[name] = quantity.DeepCopy()
} else {
value.Add(quantity)
list[name] = value
}
}
}
// calcPGMinResources calculates the minimum resources of the podgroup.
func calcPGMinResources(pod *v1.Pod) *v1.ResourceList {
pgMinRes := v1.ResourceList{}
for _, c := range pod.Spec.Containers {
addResourceList(pgMinRes, c.Resources.Requests, c.Resources.Limits)
}
return &pgMinRes
}
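// Illustrative sketch (assumption, not part of the original source): for a pod
// with two containers requesting cpu=250m/memory=64Mi and cpu=750m/memory=64Mi,
// calcPGMinResources returns roughly cpu=1, memory=128Mi; containers that only
// set Limits contribute those Limits instead (see addResourceList above).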
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"context"
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
versionedscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
busv1alpha1informer "volcano.sh/apis/pkg/client/informers/externalversions/bus/v1alpha1"
schedulinginformer "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
busv1alpha1lister "volcano.sh/apis/pkg/client/listers/bus/v1alpha1"
schedulinglister "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/apis"
"volcano.sh/volcano/pkg/controllers/framework"
queuestate "volcano.sh/volcano/pkg/controllers/queue/state"
)
func init() {
framework.RegisterController(&queuecontroller{})
}
// queuecontroller manages queue status.
type queuecontroller struct {
kubeClient kubernetes.Interface
vcClient vcclientset.Interface
// informer
queueInformer schedulinginformer.QueueInformer
pgInformer schedulinginformer.PodGroupInformer
// queueLister
queueLister schedulinglister.QueueLister
queueSynced cache.InformerSynced
// podGroup lister
pgLister schedulinglister.PodGroupLister
pgSynced cache.InformerSynced
cmdInformer busv1alpha1informer.CommandInformer
cmdLister busv1alpha1lister.CommandLister
cmdSynced cache.InformerSynced
// queues that need to be updated.
queue workqueue.RateLimitingInterface
commandQueue workqueue.RateLimitingInterface
pgMutex sync.RWMutex
// queue name -> podgroup namespace/name
podGroups map[string]map[string]struct{}
syncHandler func(req *apis.Request) error
syncCommandHandler func(cmd *busv1alpha1.Command) error
enqueueQueue func(req *apis.Request)
recorder record.EventRecorder
maxRequeueNum int
}
func (c *queuecontroller) Name() string {
return "queue-controller"
}
// Initialize creates the queue controller.
func (c *queuecontroller) Initialize(opt *framework.ControllerOption) error {
c.vcClient = opt.VolcanoClient
c.kubeClient = opt.KubeClient
factory := informerfactory.NewSharedInformerFactory(c.vcClient, 0)
queueInformer := factory.Scheduling().V1beta1().Queues()
pgInformer := factory.Scheduling().V1beta1().PodGroups()
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartLogging(klog.Infof)
eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: c.kubeClient.CoreV1().Events("")})
c.queueInformer = queueInformer
c.pgInformer = pgInformer
c.queueLister = queueInformer.Lister()
c.queueSynced = queueInformer.Informer().HasSynced
c.pgLister = pgInformer.Lister()
c.pgSynced = pgInformer.Informer().HasSynced
c.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
c.commandQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
c.podGroups = make(map[string]map[string]struct{})
c.recorder = eventBroadcaster.NewRecorder(versionedscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"})
c.maxRequeueNum = opt.MaxRequeueNum
if c.maxRequeueNum < 0 {
c.maxRequeueNum = -1
}
queueInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: c.addQueue,
UpdateFunc: c.updateQueue,
DeleteFunc: c.deleteQueue,
})
pgInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: c.addPodGroup,
UpdateFunc: c.updatePodGroup,
DeleteFunc: c.deletePodGroup,
})
c.cmdInformer = informerfactory.NewSharedInformerFactory(c.vcClient, 0).Bus().V1alpha1().Commands()
c.cmdInformer.Informer().AddEventHandler(cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch v := obj.(type) {
case *busv1alpha1.Command:
return IsQueueReference(v.TargetObject)
default:
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: c.addCommand,
},
})
c.cmdLister = c.cmdInformer.Lister()
c.cmdSynced = c.cmdInformer.Informer().HasSynced
queuestate.SyncQueue = c.syncQueue
queuestate.OpenQueue = c.openQueue
queuestate.CloseQueue = c.closeQueue
c.syncHandler = c.handleQueue
c.syncCommandHandler = c.handleCommand
c.enqueueQueue = c.enqueue
return nil
}
// Run starts QueueController.
func (c *queuecontroller) Run(stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
defer c.queue.ShutDown()
defer c.commandQueue.ShutDown()
klog.Infof("Starting queue controller.")
defer klog.Infof("Shutting down queue controller.")
go c.queueInformer.Informer().Run(stopCh)
go c.pgInformer.Informer().Run(stopCh)
go c.cmdInformer.Informer().Run(stopCh)
if !cache.WaitForCacheSync(stopCh, c.queueSynced, c.pgSynced, c.cmdSynced) {
klog.Errorf("unable to sync caches for queue controller.")
return
}
go wait.Until(c.worker, 0, stopCh)
go wait.Until(c.commandWorker, 0, stopCh)
<-stopCh
}
// worker runs a worker thread that just dequeues items, processes them, and
// marks them done. You may run as many of these in parallel as you wish; the
// workqueue guarantees that they will not end up processing the same `queue`
// at the same time.
func (c *queuecontroller) worker() {
for c.processNextWorkItem() {
}
}
func (c *queuecontroller) processNextWorkItem() bool {
obj, shutdown := c.queue.Get()
if shutdown {
return false
}
defer c.queue.Done(obj)
req, ok := obj.(*apis.Request)
if !ok {
klog.Errorf("%v is not a valid queue request struct.", obj)
return true
}
err := c.syncHandler(req)
c.handleQueueErr(err, obj)
return true
}
func (c *queuecontroller) handleQueue(req *apis.Request) error {
startTime := time.Now()
defer func() {
klog.V(4).Infof("Finished syncing queue %s (%v).", req.QueueName, time.Since(startTime))
}()
queue, err := c.queueLister.Get(req.QueueName)
if err != nil {
if apierrors.IsNotFound(err) {
klog.V(4).Infof("Queue %s has been deleted.", req.QueueName)
return nil
}
return fmt.Errorf("get queue %s failed for %v", req.QueueName, err)
}
queueState := queuestate.NewState(queue)
if queueState == nil {
return fmt.Errorf("queue %s state %s is invalid", queue.Name, queue.Status.State)
}
klog.V(4).Infof("Begin execute %s action for queue %s, current status %s", req.Action, req.QueueName, queue.Status.State)
if err := queueState.Execute(req.Action); err != nil {
return fmt.Errorf("sync queue %s failed for %v, event is %v, action is %s",
req.QueueName, err, req.Event, req.Action)
}
return nil
}
func (c *queuecontroller) handleQueueErr(err error, obj interface{}) {
if err == nil {
c.queue.Forget(obj)
return
}
if c.maxRequeueNum == -1 || c.queue.NumRequeues(obj) < c.maxRequeueNum {
klog.V(4).Infof("Error syncing queue request %v for %v.", obj, err)
c.queue.AddRateLimited(obj)
return
}
req, _ := obj.(*apis.Request)
c.recordEventsForQueue(req.QueueName, v1.EventTypeWarning, string(req.Action),
fmt.Sprintf("%v queue failed for %v", req.Action, err))
klog.V(2).Infof("Dropping queue request %v out of the queue for %v.", obj, err)
c.queue.Forget(obj)
}
func (c *queuecontroller) commandWorker() {
for c.processNextCommand() {
}
}
func (c *queuecontroller) processNextCommand() bool {
obj, shutdown := c.commandQueue.Get()
if shutdown {
return false
}
defer c.commandQueue.Done(obj)
cmd, ok := obj.(*busv1alpha1.Command)
if !ok {
klog.Errorf("%v is not a valid Command struct.", obj)
return true
}
err := c.syncCommandHandler(cmd)
c.handleCommandErr(err, obj)
return true
}
func (c *queuecontroller) handleCommand(cmd *busv1alpha1.Command) error {
startTime := time.Now()
defer func() {
klog.V(4).Infof("Finished syncing command %s/%s (%v).", cmd.Namespace, cmd.Name, time.Since(startTime))
}()
err := c.vcClient.BusV1alpha1().Commands(cmd.Namespace).Delete(context.TODO(), cmd.Name, metav1.DeleteOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
return nil
}
return fmt.Errorf("failed to delete command <%s/%s> for %v", cmd.Namespace, cmd.Name, err)
}
req := &apis.Request{
QueueName: cmd.TargetObject.Name,
Event: busv1alpha1.CommandIssuedEvent,
Action: busv1alpha1.Action(cmd.Action),
}
c.enqueueQueue(req)
return nil
}
func (c *queuecontroller) handleCommandErr(err error, obj interface{}) {
if err == nil {
c.commandQueue.Forget(obj)
return
}
if c.maxRequeueNum == -1 || c.commandQueue.NumRequeues(obj) < c.maxRequeueNum {
klog.V(4).Infof("Error syncing command %v for %v.", obj, err)
c.commandQueue.AddRateLimited(obj)
return
}
klog.V(2).Infof("Dropping command %v out of the queue for %v.", obj, err)
c.commandQueue.Forget(obj)
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"context"
"fmt"
"reflect"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/bus/v1alpha1"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/queue/state"
)
func (c *queuecontroller) syncQueue(queue *schedulingv1beta1.Queue, updateStateFn state.UpdateQueueStatusFn) error {
klog.V(4).Infof("Begin to sync queue %s.", queue.Name)
defer klog.V(4).Infof("End sync queue %s.", queue.Name)
podGroups := c.getPodGroups(queue.Name)
queueStatus := schedulingv1beta1.QueueStatus{}
for _, pgKey := range podGroups {
// Ignore the error here; it cannot occur.
ns, name, _ := cache.SplitMetaNamespaceKey(pgKey)
// TODO: check NotFound error and sync local cache.
pg, err := c.pgLister.PodGroups(ns).Get(name)
if err != nil {
return err
}
switch pg.Status.Phase {
case schedulingv1beta1.PodGroupPending:
queueStatus.Pending++
case schedulingv1beta1.PodGroupRunning:
queueStatus.Running++
case schedulingv1beta1.PodGroupUnknown:
queueStatus.Unknown++
case schedulingv1beta1.PodGroupInqueue:
queueStatus.Inqueue++
}
}
if updateStateFn != nil {
updateStateFn(&queueStatus, podGroups)
} else {
queueStatus.State = queue.Status.State
}
// ignore update when status does not change
if reflect.DeepEqual(queueStatus, queue.Status) {
return nil
}
newQueue := queue.DeepCopy()
newQueue.Status = queueStatus
if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
klog.Errorf("Failed to update status of Queue %s: %v.", newQueue.Name, err)
return err
}
return nil
}
func (c *queuecontroller) openQueue(queue *schedulingv1beta1.Queue, updateStateFn state.UpdateQueueStatusFn) error {
klog.V(4).Infof("Begin to open queue %s.", queue.Name)
newQueue := queue.DeepCopy()
newQueue.Status.State = schedulingv1beta1.QueueStateOpen
if queue.Status.State != newQueue.Status.State {
if _, err := c.vcClient.SchedulingV1beta1().Queues().Update(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.OpenQueueAction),
fmt.Sprintf("Open queue failed for %v", err))
return err
}
c.recorder.Event(newQueue, v1.EventTypeNormal, string(v1alpha1.OpenQueueAction), "Open queue succeeded")
} else {
return nil
}
q, err := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), newQueue.Name, metav1.GetOptions{})
if err != nil {
return err
}
newQueue = q.DeepCopy()
if updateStateFn != nil {
updateStateFn(&newQueue.Status, nil)
} else {
return fmt.Errorf("internal error, update state function should be provided")
}
if queue.Status.State != newQueue.Status.State {
if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.OpenQueueAction),
fmt.Sprintf("Update queue status from %s to %s failed for %v",
queue.Status.State, newQueue.Status.State, err))
return err
}
}
return nil
}
func (c *queuecontroller) closeQueue(queue *schedulingv1beta1.Queue, updateStateFn state.UpdateQueueStatusFn) error {
klog.V(4).Infof("Begin to close queue %s.", queue.Name)
newQueue := queue.DeepCopy()
newQueue.Status.State = schedulingv1beta1.QueueStateClosed
if queue.Status.State != newQueue.Status.State {
if _, err := c.vcClient.SchedulingV1beta1().Queues().Update(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.CloseQueueAction),
fmt.Sprintf("Close queue failed for %v", err))
return err
}
c.recorder.Event(newQueue, v1.EventTypeNormal, string(v1alpha1.CloseQueueAction), "Close queue succeed")
} else {
return nil
}
q, err := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), newQueue.Name, metav1.GetOptions{})
if err != nil {
return err
}
newQueue = q.DeepCopy()
podGroups := c.getPodGroups(newQueue.Name)
if updateStateFn != nil {
updateStateFn(&newQueue.Status, podGroups)
} else {
return fmt.Errorf("internal error, update state function should be provided")
}
if queue.Status.State != newQueue.Status.State {
if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.CloseQueueAction),
fmt.Sprintf("Update queue status from %s to %s failed for %v",
queue.Status.State, newQueue.Status.State, err))
return err
}
}
return nil
}
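// markQueueOpen is an illustrative sketch (not part of the original source) of an update
// callback shaped like the updateStateFn parameter used by syncQueue/openQueue/closeQueue
// above: it receives the freshly computed status plus the pod group keys and decides the
// resulting queue state. The function name is hypothetical.
func markQueueOpen(status *schedulingv1beta1.QueueStatus, podGroupList []string) {
	// Keep the queue open regardless of how many pod groups it currently holds.
	_ = podGroupList
	status.State = schedulingv1beta1.QueueStateOpen
}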
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"k8s.io/client-go/tools/cache"
"k8s.io/klog"
busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/apis"
)
func (c *queuecontroller) enqueue(req *apis.Request) {
c.queue.Add(req)
}
func (c *queuecontroller) addQueue(obj interface{}) {
queue := obj.(*schedulingv1beta1.Queue)
req := &apis.Request{
QueueName: queue.Name,
Event: busv1alpha1.OutOfSyncEvent,
Action: busv1alpha1.SyncQueueAction,
}
c.enqueue(req)
}
func (c *queuecontroller) deleteQueue(obj interface{}) {
queue, ok := obj.(*schedulingv1beta1.Queue)
if !ok {
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
klog.Errorf("Couldn't get object from tombstone %#v.", obj)
return
}
queue, ok = tombstone.Obj.(*schedulingv1beta1.Queue)
if !ok {
klog.Errorf("Tombstone contained object that is not a Queue: %#v.", obj)
return
}
}
c.pgMutex.Lock()
defer c.pgMutex.Unlock()
delete(c.podGroups, queue.Name)
}
func (c *queuecontroller) updateQueue(_, _ interface{}) {
// currently do not care about queue update
}
func (c *queuecontroller) addPodGroup(obj interface{}) {
pg := obj.(*schedulingv1beta1.PodGroup)
key, _ := cache.MetaNamespaceKeyFunc(obj)
c.pgMutex.Lock()
defer c.pgMutex.Unlock()
if c.podGroups[pg.Spec.Queue] == nil {
c.podGroups[pg.Spec.Queue] = make(map[string]struct{})
}
c.podGroups[pg.Spec.Queue][key] = struct{}{}
req := &apis.Request{
QueueName: pg.Spec.Queue,
Event: busv1alpha1.OutOfSyncEvent,
Action: busv1alpha1.SyncQueueAction,
}
c.enqueue(req)
}
func (c *queuecontroller) updatePodGroup(old, new interface{}) {
oldPG := old.(*schedulingv1beta1.PodGroup)
newPG := new.(*schedulingv1beta1.PodGroup)
// Note: we have no use case for updating PodGroup.Spec.Queue,
// so it is not considered here.
if oldPG.Status.Phase != newPG.Status.Phase {
c.addPodGroup(newPG)
}
}
func (c *queuecontroller) deletePodGroup(obj interface{}) {
pg, ok := obj.(*schedulingv1beta1.PodGroup)
if !ok {
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
klog.Errorf("Couldn't get object from tombstone %#v.", obj)
return
}
pg, ok = tombstone.Obj.(*schedulingv1beta1.PodGroup)
if !ok {
klog.Errorf("Tombstone contained object that is not a PodGroup: %#v.", obj)
return
}
}
key, _ := cache.MetaNamespaceKeyFunc(obj)
c.pgMutex.Lock()
defer c.pgMutex.Unlock()
delete(c.podGroups[pg.Spec.Queue], key)
req := &apis.Request{
QueueName: pg.Spec.Queue,
Event: busv1alpha1.OutOfSyncEvent,
Action: busv1alpha1.SyncQueueAction,
}
c.enqueue(req)
}
func (c *queuecontroller) addCommand(obj interface{}) {
cmd, ok := obj.(*busv1alpha1.Command)
if !ok {
klog.Errorf("Obj %v is not command.", obj)
return
}
c.commandQueue.Add(cmd)
}
func (c *queuecontroller) getPodGroups(key string) []string {
c.pgMutex.RLock()
defer c.pgMutex.RUnlock()
if c.podGroups[key] == nil {
return nil
}
podGroups := make([]string, 0, len(c.podGroups[key]))
for pgKey := range c.podGroups[key] {
podGroups = append(podGroups, pgKey)
}
return podGroups
}
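// queuedPodGroupCount is an illustrative sketch (not part of the original source) showing how
// the podGroups index maintained above (queue name -> set of pod group keys) can be queried;
// it mirrors getPodGroups but only reports the number of keys. The method name is hypothetical.
func (c *queuecontroller) queuedPodGroupCount(queueName string) int {
	c.pgMutex.RLock()
	defer c.pgMutex.RUnlock()
	return len(c.podGroups[queueName])
}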
func (c *queuecontroller) recordEventsForQueue(name, eventType, reason, message string) {
queue, err := c.queueLister.Get(name)
if err != nil {
klog.Errorf("Get queue %s failed for %v.", name, err)
return
}
c.recorder.Event(queue, eventType, reason, message)
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
// IsQueueReference returns whether the ownerReference refers to the Queue kind.
func IsQueueReference(ref *metav1.OwnerReference) bool {
if ref == nil {
return false
}
if ref.APIVersion != schedulingv1beta1.SchemeGroupVersion.String() {
return false
}
if ref.Kind != "Queue" {
return false
}
return true
}
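// ownerQueueName is an illustrative sketch (not part of the original source) showing how
// IsQueueReference can be used to resolve the owning Queue of an object from its
// OwnerReferences. The helper name is hypothetical.
func ownerQueueName(refs []metav1.OwnerReference) string {
	for i := range refs {
		if IsQueueReference(&refs[i]) {
			return refs[i].Name
		}
	}
	return ""
}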
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package allocate
import (
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/framework"
"volcano.sh/volcano/pkg/scheduler/metrics"
"volcano.sh/volcano/pkg/scheduler/util"
)
var targetJob = util.Reservation.TargetJob
type Action struct{}
func New() *Action {
return &Action{}
}
func (alloc *Action) Name() string {
return "allocate"
}
func (alloc *Action) Initialize() {}
func (alloc *Action) Execute(ssn *framework.Session) {
klog.V(3).Infof("Enter Allocate ...")
defer klog.V(3).Infof("Leaving Allocate ...")
// the allocation for a pod may have many stages
// 1. pick a namespace named N (using ssn.NamespaceOrderFn)
// 2. pick a queue named Q from N (using ssn.QueueOrderFn)
// 3. pick a job named J from Q (using ssn.JobOrderFn)
// 4. pick a task T from J (using ssn.TaskOrderFn)
// 5. use predicateFn to filter out nodes that T cannot be allocated on.
// 6. use ssn.NodeOrderFn to pick the best node and assign it to T
namespaces := util.NewPriorityQueue(ssn.NamespaceOrderFn)
// jobsMap is map[api.NamespaceName]map[api.QueueID]PriorityQueue(*api.JobInfo)
// used to find the job with the highest priority in a given queue and namespace
jobsMap := map[api.NamespaceName]map[api.QueueID]*util.PriorityQueue{}
for _, job := range ssn.Jobs {
if job.IsPending() {
klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: job status is pending.",
job.Namespace, job.Name, job.Queue)
continue
}
if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
continue
}
if _, found := ssn.Queues[job.Queue]; !found {
klog.Warningf("Skip adding Job <%s/%s> because its queue %s is not found",
job.Namespace, job.Name, job.Queue)
continue
}
namespace := api.NamespaceName(job.Namespace)
queueMap, found := jobsMap[namespace]
if !found {
namespaces.Push(namespace)
queueMap = make(map[api.QueueID]*util.PriorityQueue)
jobsMap[namespace] = queueMap
}
jobs, found := queueMap[job.Queue]
if !found {
jobs = util.NewPriorityQueue(ssn.JobOrderFn)
queueMap[job.Queue] = jobs
}
klog.V(4).Infof("Added Job <%s/%s> into Queue <%s>", job.Namespace, job.Name, job.Queue)
jobs.Push(job)
}
klog.V(3).Infof("Try to allocate resource to %d Namespaces", len(jobsMap))
pendingTasks := map[api.JobID]*util.PriorityQueue{}
allNodes := ssn.NodeList
unlockedNodes := allNodes
if targetJob != nil && len(util.Reservation.LockedNodes) != 0 {
unlockedNodes = unlockedNodes[0:0]
for _, node := range allNodes {
if _, exist := util.Reservation.LockedNodes[node.Name]; !exist {
unlockedNodes = append(unlockedNodes, node)
}
}
}
for _, unlockedNode := range unlockedNodes {
klog.V(4).Infof("unlockedNode ID: %s, Name: %s", unlockedNode.Node.UID, unlockedNode.Node.Name)
}
predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) error {
// Check for Resource Predicate
if !task.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
return api.NewFitError(task, node, api.NodeResourceFitFailed)
}
return ssn.PredicateFn(task, node)
}
// To pick a <namespace, queue> tuple for a job, we choose to pick the namespace first,
// because we believe the number of queues is usually smaller than the number of namespaces,
// and this also keeps resource usage balanced across namespaces.
for {
if namespaces.Empty() {
break
}
// pick namespace from namespaces PriorityQueue
namespace := namespaces.Pop().(api.NamespaceName)
queueInNamespace := jobsMap[namespace]
// pick a queue for the given namespace
//
// This block uses an algorithm with O(n) time complexity.
// A PriorityQueue cannot be used here, because allocating a job would change the
// priority of the queue among all namespaces, and the PriorityQueue has no way to
// update the priority of a specific queue in place.
var queue *api.QueueInfo
for queueID := range queueInNamespace {
currentQueue := ssn.Queues[queueID]
if ssn.Overused(currentQueue) {
klog.V(3).Infof("Namespace <%s> Queue <%s> is overused, ignore it.", namespace, currentQueue.Name)
delete(queueInNamespace, queueID)
continue
}
if jobs, found := queueInNamespace[currentQueue.UID]; found && jobs.Empty() {
continue
}
if queue == nil || ssn.QueueOrderFn(currentQueue, queue) {
queue = currentQueue
}
}
if queue == nil {
klog.V(3).Infof("Namespace <%s> have no queue, skip it", namespace)
continue
}
klog.V(3).Infof("Try to allocate resource to Jobs in Namespace <%s> Queue <%v>", namespace, queue.Name)
jobs, found := queueInNamespace[queue.UID]
if !found || jobs.Empty() {
delete(queueInNamespace, queue.UID)
namespaces.Push(namespace)
klog.V(4).Infof("Can not find jobs for queue %s.", queue.Name)
continue
}
job := jobs.Pop().(*api.JobInfo)
var nodes []*api.NodeInfo
if targetJob != nil && job.UID == targetJob.UID {
klog.V(4).Infof("Try to allocate resource to target job: %s", job.Name)
nodes = allNodes
} else {
nodes = unlockedNodes
}
if _, found = pendingTasks[job.UID]; !found {
tasks := util.NewPriorityQueue(ssn.TaskOrderFn)
for _, task := range job.TaskStatusIndex[api.Pending] {
// Skip BestEffort task in 'allocate' action.
if task.Resreq.IsEmpty() {
klog.V(4).Infof("Task <%v/%v> is BestEffort task, skip it.",
task.Namespace, task.Name)
continue
}
tasks.Push(task)
}
pendingTasks[job.UID] = tasks
}
tasks := pendingTasks[job.UID]
klog.V(3).Infof("Try to allocate resource to %d tasks of Job <%v/%v>",
tasks.Len(), job.Namespace, job.Name)
stmt := framework.NewStatement(ssn)
ph := util.NewPredicateHelper()
for !tasks.Empty() {
task := tasks.Pop().(*api.TaskInfo)
// Check whether the queue is overused on the dimensions that the task requested
taskRequest := task.Resreq.ResourceNames()
if underusedResources := ssn.UnderusedResources(queue); underusedResources != nil && !underusedResources.Contains(taskRequest) {
klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name)
continue
}
klog.V(3).Infof("There are <%d> nodes for Job <%v/%v>", len(nodes), job.Namespace, job.Name)
predicateNodes, fitErrors := ph.PredicateNodes(task, nodes, predicateFn)
if len(predicateNodes) == 0 {
job.NodesFitErrors[task.UID] = fitErrors
break
}
var candidateNodes []*api.NodeInfo
for _, n := range predicateNodes {
if task.InitResreq.LessEqual(n.Idle, api.Zero) || task.InitResreq.LessEqual(n.FutureIdle(), api.Zero) {
candidateNodes = append(candidateNodes, n)
}
}
// If there are no candidate nodes for this task, skip it.
if len(candidateNodes) == 0 {
continue
}
nodeScores := util.PrioritizeNodes(task, candidateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
node := ssn.BestNodeFn(task, nodeScores)
if node == nil {
node = util.SelectBestNode(nodeScores)
}
// Allocate idle resource to the task.
if task.InitResreq.LessEqual(node.Idle, api.Zero) {
klog.V(3).Infof("Binding Task <%v/%v> to node <%v>",
task.Namespace, task.Name, node.Name)
if err := stmt.Allocate(task, node); err != nil {
klog.Errorf("Failed to bind Task %v on %v in Session %v, err: %v",
task.UID, node.Name, ssn.UID, err)
} else {
metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time))
}
} else {
klog.V(3).Infof("Predicates failed for task <%s/%s> on node <%s> with limited resources",
task.Namespace, task.Name, node.Name)
// Allocate releasing resource to the task if any.
if task.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
klog.V(3).Infof("Pipelining Task <%v/%v> to node <%v> for <%v> on <%v>",
task.Namespace, task.Name, node.Name, task.InitResreq, node.Releasing)
if err := stmt.Pipeline(task, node.Name); err != nil {
klog.Errorf("Failed to pipeline Task %v on %v in Session %v for %v.",
task.UID, node.Name, ssn.UID, err)
} else {
metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time))
}
}
}
if ssn.JobReady(job) && !tasks.Empty() {
jobs.Push(job)
break
}
}
if ssn.JobReady(job) {
stmt.Commit()
} else {
if !ssn.JobPipelined(job) {
stmt.Discard()
}
}
// Add the namespace back until it has no jobs left.
namespaces.Push(namespace)
}
}
func (alloc *Action) UnInitialize() {}
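// drainJobs is an illustrative sketch (not part of the original source) of the priority-queue
// pattern used throughout Execute above: jobs pushed with util.NewPriorityQueue(ssn.JobOrderFn)
// come back out of Pop in priority order. The helper name is hypothetical.
func drainJobs(jobs *util.PriorityQueue) []*api.JobInfo {
	var ordered []*api.JobInfo
	for !jobs.Empty() {
		ordered = append(ordered, jobs.Pop().(*api.JobInfo))
	}
	return ordered
}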
// Package elect is used to find the target job and reserve resources for it
package elect
import (
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/scheduling"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/framework"
"volcano.sh/volcano/pkg/scheduler/util"
)
// Action defines the action
type Action struct{}
// New returns the action instance
func New() *Action {
return &Action{}
}
// Name returns the action name
func (alloc *Action) Name() string {
return "elect"
}
// Initialize inits the action
func (alloc *Action) Initialize() {}
// Execute selects the target job which is of the highest priority and waits for the longest time.
func (alloc *Action) Execute(ssn *framework.Session) {
klog.V(3).Infof("Enter Elect ...")
defer klog.V(3).Infof("Leaving Elect ...")
if util.Reservation.TargetJob == nil {
klog.V(4).Infof("Start select Target Job")
var pendingJobs []*api.JobInfo
for _, job := range ssn.Jobs {
if job.PodGroup.Status.Phase == scheduling.PodGroupPending {
pendingJobs = append(pendingJobs, job)
}
}
util.Reservation.TargetJob = ssn.TargetJob(pendingJobs)
if util.Reservation.TargetJob != nil {
klog.V(3).Infof("Target Job name: %s", util.Reservation.TargetJob.Name)
} else {
klog.V(3).Infof("Target Job name: nil")
}
}
}
// UnInitialize releases resources which are no longer useful.
func (alloc *Action) UnInitialize() {}
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package preempt
import (
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/framework"
"volcano.sh/volcano/pkg/scheduler/metrics"
"volcano.sh/volcano/pkg/scheduler/util"
)
type Action struct{}
func New() *Action {
return &Action{}
}
func (alloc *Action) Name() string {
return "preempt"
}
func (alloc *Action) Initialize() {}
func (alloc *Action) Execute(ssn *framework.Session) {
klog.V(3).Infof("Enter Preempt ...")
defer klog.V(3).Infof("Leaving Preempt ...")
preemptorsMap := map[api.QueueID]*util.PriorityQueue{}
preemptorTasks := map[api.JobID]*util.PriorityQueue{}
var underRequest []*api.JobInfo
queues := map[api.QueueID]*api.QueueInfo{}
for _, job := range ssn.Jobs {
if job.IsPending() {
continue
}
if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
klog.V(4).Infof("Job <%s/%s> Queue <%s> skip preemption, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
continue
}
if queue, found := ssn.Queues[job.Queue]; !found {
continue
} else if _, existed := queues[queue.UID]; !existed {
klog.V(3).Infof("Added Queue <%s> for Job <%s/%s>",
queue.Name, job.Namespace, job.Name)
queues[queue.UID] = queue
}
// Check whether the job is starving for more resources.
if ssn.JobStarving(job) {
if _, found := preemptorsMap[job.Queue]; !found {
preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn)
}
preemptorsMap[job.Queue].Push(job)
underRequest = append(underRequest, job)
preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
for _, task := range job.TaskStatusIndex[api.Pending] {
preemptorTasks[job.UID].Push(task)
}
}
}
ph := util.NewPredicateHelper()
// Preemption between Jobs within Queue.
for _, queue := range queues {
for {
preemptors := preemptorsMap[queue.UID]
// If no preemptors, no preemption.
if preemptors == nil || preemptors.Empty() {
klog.V(4).Infof("No preemptors in Queue <%s>, break.", queue.Name)
break
}
preemptorJob := preemptors.Pop().(*api.JobInfo)
stmt := framework.NewStatement(ssn)
assigned := false
for {
// If the job no longer requests more resources, stop preempting.
if !ssn.JobStarving(preemptorJob) {
break
}
// If there are no preemptor tasks, move to the next job.
if preemptorTasks[preemptorJob.UID].Empty() {
klog.V(3).Infof("No preemptor task in job <%s/%s>.",
preemptorJob.Namespace, preemptorJob.Name)
break
}
preemptor := preemptorTasks[preemptorJob.UID].Pop().(*api.TaskInfo)
if preempted, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool {
// Ignore non running task.
if task.Status != api.Running {
return false
}
// Ignore task with empty resource request.
if task.Resreq.IsEmpty() {
return false
}
job, found := ssn.Jobs[task.Job]
if !found {
return false
}
// Preempt other jobs within queue
return job.Queue == preemptorJob.Queue && preemptor.Job != task.Job
}, ph); preempted {
assigned = true
}
}
// Commit changes only if job is pipelined, otherwise try next job.
if ssn.JobPipelined(preemptorJob) {
stmt.Commit()
} else {
stmt.Discard()
continue
}
if assigned {
preemptors.Push(preemptorJob)
}
}
// Preemption between Task within Job.
for _, job := range underRequest {
// Fix: preemptor tasks were lost when preempting within the same job
preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
for _, task := range job.TaskStatusIndex[api.Pending] {
preemptorTasks[job.UID].Push(task)
}
for {
if _, found := preemptorTasks[job.UID]; !found {
break
}
if preemptorTasks[job.UID].Empty() {
break
}
preemptor := preemptorTasks[job.UID].Pop().(*api.TaskInfo)
stmt := framework.NewStatement(ssn)
assigned, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool {
// Ignore non running task.
if task.Status != api.Running {
return false
}
// Ignore task with empty resource request.
if task.Resreq.IsEmpty() {
return false
}
// Preempt tasks within job.
return preemptor.Job == task.Job
}, ph)
stmt.Commit()
// If no preemption, next job.
if !assigned {
break
}
}
}
}
// call victimTasksFn to evict tasks
victimTasks(ssn)
}
func (alloc *Action) UnInitialize() {}
func preempt(
ssn *framework.Session,
stmt *framework.Statement,
preemptor *api.TaskInfo,
filter func(*api.TaskInfo) bool,
predicateHelper util.PredicateHelper,
) (bool, error) {
assigned := false
allNodes := ssn.NodeList
predicateNodes, _ := predicateHelper.PredicateNodes(preemptor, allNodes, ssn.PredicateFn)
nodeScores := util.PrioritizeNodes(preemptor, predicateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
selectedNodes := util.SortNodes(nodeScores)
for _, node := range selectedNodes {
klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.",
preemptor.Namespace, preemptor.Name, node.Name)
var preemptees []*api.TaskInfo
for _, task := range node.Tasks {
if filter == nil {
preemptees = append(preemptees, task.Clone())
} else if filter(task) {
preemptees = append(preemptees, task.Clone())
}
}
victims := ssn.Preemptable(preemptor, preemptees)
metrics.UpdatePreemptionVictimsCount(len(victims))
if err := util.ValidateVictims(preemptor, node, victims); err != nil {
klog.V(3).Infof("No validated victims on Node <%s>: %v", node.Name, err)
continue
}
victimsQueue := util.NewPriorityQueue(func(l, r interface{}) bool {
return !ssn.TaskOrderFn(l, r)
})
for _, victim := range victims {
victimsQueue.Push(victim)
}
// Preempt victims for tasks, pick lowest priority task first.
preempted := api.EmptyResource()
for !victimsQueue.Empty() {
// If reclaimed enough resources, break loop to avoid Sub panic.
if preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
break
}
preemptee := victimsQueue.Pop().(*api.TaskInfo)
klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>",
preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name)
if err := stmt.Evict(preemptee, "preempt"); err != nil {
klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v",
preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err)
continue
}
preempted.Add(preemptee.Resreq)
}
metrics.RegisterPreemptionAttempts()
klog.V(3).Infof("Preempted <%v> for Task <%s/%s> requested <%v>.",
preempted, preemptor.Namespace, preemptor.Name, preemptor.InitResreq)
if preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
if err := stmt.Pipeline(preemptor, node.Name); err != nil {
klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>",
preemptor.Namespace, preemptor.Name, node.Name)
}
// Ignore pipeline error, will be corrected in next scheduling loop.
assigned = true
break
}
}
return assigned, nil
}
func victimTasks(ssn *framework.Session) {
stmt := framework.NewStatement(ssn)
victimTasks := ssn.VictimTasks()
for _, victim := range victimTasks {
if err := stmt.Evict(victim.Clone(), "evict"); err != nil {
klog.Errorf("Failed to evict Task <%s/%s>: %v",
victim.Namespace, victim.Name, err)
continue
}
}
stmt.Commit()
}
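// preemptableOnly is an illustrative sketch (not part of the original source) of a filter that
// could be passed to preempt above: it admits only running, explicitly preemptable tasks with a
// non-empty resource request as preemption candidates. The function name is hypothetical.
func preemptableOnly(task *api.TaskInfo) bool {
	return task.Status == api.Running && task.Preemptable && !task.Resreq.IsEmpty()
}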
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package reclaim
import (
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/framework"
"volcano.sh/volcano/pkg/scheduler/util"
)
type Action struct{}
func New() *Action {
return &Action{}
}
func (ra *Action) Name() string {
return "reclaim"
}
func (ra *Action) Initialize() {}
func (ra *Action) Execute(ssn *framework.Session) {
klog.V(3).Infof("Enter Reclaim ...")
defer klog.V(3).Infof("Leaving Reclaim ...")
queues := util.NewPriorityQueue(ssn.QueueOrderFn)
queueMap := map[api.QueueID]*api.QueueInfo{}
preemptorsMap := map[api.QueueID]*util.PriorityQueue{}
preemptorTasks := map[api.JobID]*util.PriorityQueue{}
klog.V(3).Infof("There are <%d> Jobs and <%d> Queues in total for scheduling.",
len(ssn.Jobs), len(ssn.Queues))
for _, job := range ssn.Jobs {
if job.IsPending() {
continue
}
if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
klog.V(4).Infof("Job <%s/%s> Queue <%s> skip reclaim, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
continue
}
if queue, found := ssn.Queues[job.Queue]; !found {
klog.Errorf("Failed to find Queue <%s> for Job <%s/%s>",
job.Queue, job.Namespace, job.Name)
continue
} else if _, existed := queueMap[queue.UID]; !existed {
klog.V(4).Infof("Added Queue <%s> for Job <%s/%s>", queue.Name, job.Namespace, job.Name)
queueMap[queue.UID] = queue
queues.Push(queue)
}
if len(job.TaskStatusIndex[api.Pending]) != 0 {
if _, found := preemptorsMap[job.Queue]; !found {
preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn)
}
preemptorsMap[job.Queue].Push(job)
preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
for _, task := range job.TaskStatusIndex[api.Pending] {
preemptorTasks[job.UID].Push(task)
}
}
}
for {
// If no queues, break
if queues.Empty() {
break
}
var job *api.JobInfo
var task *api.TaskInfo
queue := queues.Pop().(*api.QueueInfo)
if ssn.Overused(queue) {
klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name)
continue
}
// Found "high" priority job
jobs, found := preemptorsMap[queue.UID]
if !found || jobs.Empty() {
continue
} else {
job = jobs.Pop().(*api.JobInfo)
}
// Found "high" priority task to reclaim others
if tasks, found := preemptorTasks[job.UID]; !found || tasks.Empty() {
continue
} else {
task = tasks.Pop().(*api.TaskInfo)
}
// Check whether the queue is overused on the dimensions that the task requested
taskRequest := task.Resreq.ResourceNames()
if underusedResources := ssn.UnderusedResources(queue); underusedResources != nil && !underusedResources.Contains(taskRequest) {
klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name)
continue
}
assigned := false
for _, n := range ssn.Nodes {
// If predicates failed, next node.
if err := ssn.PredicateFn(task, n); err != nil {
continue
}
klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.",
task.Namespace, task.Name, n.Name)
var reclaimees []*api.TaskInfo
for _, task := range n.Tasks {
// Ignore non running task.
if task.Status != api.Running {
continue
}
if j, found := ssn.Jobs[task.Job]; !found {
continue
} else if j.Queue != job.Queue {
q := ssn.Queues[j.Queue]
if !q.Reclaimable() {
continue
}
// Clone the task to avoid modifying the task's status on the node.
reclaimees = append(reclaimees, task.Clone())
}
}
victims := ssn.Reclaimable(task, reclaimees)
if err := util.ValidateVictims(task, n, victims); err != nil {
klog.V(3).Infof("No validated victims on Node <%s>: %v", n.Name, err)
continue
}
resreq := task.InitResreq.Clone()
reclaimed := api.EmptyResource()
// Reclaim victims for tasks.
for _, reclaimee := range victims {
klog.Errorf("Try to reclaim Task <%s/%s> for Tasks <%s/%s>",
reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name)
if err := ssn.Evict(reclaimee, "reclaim"); err != nil {
klog.Errorf("Failed to reclaim Task <%s/%s> for Tasks <%s/%s>: %v",
reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name, err)
continue
}
reclaimed.Add(reclaimee.Resreq)
// If reclaimed enough resources, break loop to avoid Sub panic.
if resreq.LessEqual(reclaimed, api.Zero) {
break
}
}
klog.V(3).Infof("Reclaimed <%v> for task <%s/%s> requested <%v>.",
reclaimed, task.Namespace, task.Name, task.InitResreq)
if task.InitResreq.LessEqual(reclaimed, api.Zero) {
if err := ssn.Pipeline(task, n.Name); err != nil {
klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>",
task.Namespace, task.Name, n.Name)
}
// Ignore error of pipeline, will be corrected in next scheduling loop.
assigned = true
break
}
}
if assigned {
jobs.Push(job)
}
queues.Push(queue)
}
}
func (ra *Action) UnInitialize() {
}
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"fmt"
)
// ClusterInfo is a snapshot of the cluster taken by the cache.
type ClusterInfo struct {
Jobs map[JobID]*JobInfo
Nodes map[string]*NodeInfo
Queues map[QueueID]*QueueInfo
NamespaceInfo map[NamespaceName]*NamespaceInfo
RevocableNodes map[string]*NodeInfo
NodeList []string
}
func (ci ClusterInfo) String() string {
str := "Cache:\n"
if len(ci.Nodes) != 0 {
str += "Nodes:\n"
for _, n := range ci.Nodes {
str += fmt.Sprintf("\t %s: idle(%v) used(%v) allocatable(%v) pods(%d)\n",
n.Name, n.Idle, n.Used, n.Allocatable, len(n.Tasks))
i := 0
for _, p := range n.Tasks {
str += fmt.Sprintf("\t\t %d: %v\n", i, p)
i++
}
}
}
if len(ci.Jobs) != 0 {
str += "Jobs:\n"
for _, job := range ci.Jobs {
str += fmt.Sprintf("\t Job(%s) name(%s) minAvailable(%v)\n",
job.UID, job.Name, job.MinAvailable)
i := 0
for _, task := range job.Tasks {
str += fmt.Sprintf("\t\t %d: %v\n", i, task)
i++
}
}
}
if len(ci.NamespaceInfo) != 0 {
str += "Namespaces:\n"
for _, ns := range ci.NamespaceInfo {
str += fmt.Sprintf("\t Namespace(%s) Weight(%v)\n",
ns.Name, ns.Weight)
}
}
if len(ci.NodeList) != 0 {
str += fmt.Sprintf("NodeList: %v\n", ci.NodeList)
}
return str
}
/*
Copyright 2020 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
v1 "k8s.io/api/core/v1"
)
// GPUDevice includes the GPU ID, memory, and the pods that are sharing it.
type GPUDevice struct {
// GPU ID
ID int
// The pods that are sharing this GPU
PodMap map[string]*v1.Pod
// memory per card
Memory uint
}
// NewGPUDevice creates a device
func NewGPUDevice(id int, mem uint) *GPUDevice {
return &GPUDevice{
ID: id,
Memory: mem,
PodMap: map[string]*v1.Pod{},
}
}
// getUsedGPUMemory calculates the used memory of the device.
func (g *GPUDevice) getUsedGPUMemory() uint {
res := uint(0)
for _, pod := range g.PodMap {
if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
continue
} else {
gpuRequest := GetGPUResourceOfPod(pod)
res += gpuRequest
}
}
return res
}
// GetGPUResourceOfPod returns the GPU resource required by the pod.
func GetGPUResourceOfPod(pod *v1.Pod) uint {
var mem uint
for _, container := range pod.Spec.Containers {
mem += getGPUResourceOfContainer(&container)
}
return mem
}
// getGPUResourceOfContainer returns the GPU resource required by the container.
func getGPUResourceOfContainer(container *v1.Container) uint {
var mem uint
if val, ok := container.Resources.Limits[VolcanoGPUResource]; ok {
mem = uint(val.Value())
}
return mem
}
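// getFreeGPUMemory is an illustrative sketch (not part of the original source): it combines the
// per-card Memory field with getUsedGPUMemory above to report how much GPU memory is still
// available on the device. The method name is hypothetical.
func (g *GPUDevice) getFreeGPUMemory() uint {
	used := g.getUsedGPUMemory()
	if used >= g.Memory {
		return 0
	}
	return g.Memory - used
}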
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"fmt"
v1 "k8s.io/api/core/v1"
clientcache "k8s.io/client-go/tools/cache"
)
// PodKey returns the string key of a pod.
func PodKey(pod *v1.Pod) TaskID {
key, err := clientcache.MetaNamespaceKeyFunc(pod)
if err != nil {
return TaskID(fmt.Sprintf("%v/%v", pod.Namespace, pod.Name))
}
return TaskID(key)
}
func getTaskStatus(pod *v1.Pod) TaskStatus {
switch pod.Status.Phase {
case v1.PodRunning:
if pod.DeletionTimestamp != nil {
return Releasing
}
return Running
case v1.PodPending:
if pod.DeletionTimestamp != nil {
return Releasing
}
if len(pod.Spec.NodeName) == 0 {
return Pending
}
return Bound
case v1.PodUnknown:
return Unknown
case v1.PodSucceeded:
return Succeeded
case v1.PodFailed:
return Failed
}
return Unknown
}
// AllocatedStatus checks whether the task has an allocated status.
func AllocatedStatus(status TaskStatus) bool {
switch status {
case Bound, Binding, Running, Allocated:
return true
default:
return false
}
}
// MergeErrors is used to merge multiple errors into a single error
func MergeErrors(errs ...error) error {
msg := "errors: "
foundErr := false
i := 1
for _, e := range errs {
if e != nil {
if foundErr {
msg = fmt.Sprintf("%s, %d: ", msg, i)
} else {
msg = fmt.Sprintf("%s %d: ", msg, i)
}
msg = fmt.Sprintf("%s%v", msg, e)
foundErr = true
i++
}
}
if foundErr {
return fmt.Errorf("%s", msg)
}
return nil
}
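// mergeErrorsExample is an illustrative sketch (not part of the original source) showing how
// MergeErrors above folds a variadic error list, skipping nil entries and numbering the rest.
// The function name is hypothetical.
func mergeErrorsExample() error {
	// Produces an error whose message is "errors:  1: bind failed, 2: evict failed".
	return MergeErrors(nil, fmt.Errorf("bind failed"), fmt.Errorf("evict failed"))
}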
// JobTerminated checks whether job was terminated.
func JobTerminated(job *JobInfo) bool {
return job.PodGroup == nil && len(job.Tasks) == 0
}
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helpers
import (
"math"
v1 "k8s.io/api/core/v1"
"volcano.sh/volcano/pkg/scheduler/api"
)
// Min is used to find the min of two resource types
func Min(l, r *api.Resource) *api.Resource {
res := &api.Resource{}
res.MilliCPU = math.Min(l.MilliCPU, r.MilliCPU)
res.Memory = math.Min(l.Memory, r.Memory)
if l.ScalarResources == nil || r.ScalarResources == nil {
return res
}
res.ScalarResources = map[v1.ResourceName]float64{}
for lName, lQuant := range l.ScalarResources {
res.ScalarResources[lName] = math.Min(lQuant, r.ScalarResources[lName])
}
return res
}
// Max returns the resource object with larger value in each dimension.
func Max(l, r *api.Resource) *api.Resource {
res := &api.Resource{}
res.MilliCPU = math.Max(l.MilliCPU, r.MilliCPU)
res.Memory = math.Max(l.Memory, r.Memory)
if l.ScalarResources == nil && r.ScalarResources == nil {
return res
}
res.ScalarResources = map[v1.ResourceName]float64{}
if l.ScalarResources != nil {
for lName, lQuant := range l.ScalarResources {
if lQuant > 0 {
res.ScalarResources[lName] = lQuant
}
}
}
if r.ScalarResources != nil {
for rName, rQuant := range r.ScalarResources {
if rQuant > 0 {
maxQuant := math.Max(rQuant, res.ScalarResources[rName])
res.ScalarResources[rName] = maxQuant
}
}
}
return res
}
// Share is used to determine the share
func Share(l, r float64) float64 {
var share float64
if r == 0 {
if l == 0 {
share = 0
} else {
share = 1
}
} else {
share = l / r
}
return share
}
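// dominantShare is an illustrative sketch (not part of the original source) showing how Share
// above is typically combined across dimensions, e.g. for DRF-style fairness: the dominant share
// of an allocation against a capacity is the maximum of the per-resource shares. Only CPU and
// memory are considered here; the helper name is hypothetical.
func dominantShare(allocated, capacity *api.Resource) float64 {
	share := Share(allocated.MilliCPU, capacity.MilliCPU)
	if memShare := Share(allocated.Memory, capacity.Memory); memShare > share {
		share = memShare
	}
	return share
}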
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"encoding/json"
"errors"
"fmt"
"sort"
"strconv"
"strings"
"time"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog"
volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/apis/pkg/apis/scheduling"
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
// DisruptionBudget defines the job's minimum available and maximum unavailable pod values.
type DisruptionBudget struct {
MinAvailable string
MaxUnavilable string
}
// NewDisruptionBudget creates a disruption budget for a job.
func NewDisruptionBudget(minAvailable, maxUnavilable string) *DisruptionBudget {
disruptionBudget := &DisruptionBudget{
MinAvailable: minAvailable,
MaxUnavilable: maxUnavilable,
}
return disruptionBudget
}
// Clone returns a clone of the DisruptionBudget.
func (db *DisruptionBudget) Clone() *DisruptionBudget {
return &DisruptionBudget{
MinAvailable: db.MinAvailable,
MaxUnavilable: db.MaxUnavilable,
}
}
// JobWaitingTime is the maximum waiting time that a job may stay Pending under its service level agreement;
// when a job waits longer than this, it should be enqueued at once and the cluster should reserve resources for it
const JobWaitingTime = "sla-waiting-time"
// TaskID is UID type for Task
type TaskID types.UID
// TransactionContext holds all the fields needed by a scheduling transaction
type TransactionContext struct {
NodeName string
Status TaskStatus
}
// Clone returns a clone of the TransactionContext
func (ctx *TransactionContext) Clone() *TransactionContext {
if ctx == nil {
return nil
}
clone := *ctx
return &clone
}
type TopologyInfo struct {
Policy string
ResMap map[int]v1.ResourceList // key: numa ID
}
func (info *TopologyInfo) Clone() *TopologyInfo {
copyInfo := &TopologyInfo{
Policy: info.Policy,
ResMap: make(map[int]v1.ResourceList),
}
for numaId, resList := range info.ResMap {
copyInfo.ResMap[numaId] = resList.DeepCopy()
}
return copyInfo
}
// TaskInfo holds all info about the task
type TaskInfo struct {
UID TaskID
Job JobID
Name string
Namespace string
// Resreq is the resource used while the task is running.
Resreq *Resource
// InitResreq is the resource used to launch the task.
InitResreq *Resource
TransactionContext
// LastTransaction holds the context of last scheduling transaction
LastTransaction *TransactionContext
Priority int32
VolumeReady bool
Preemptable bool
BestEffort bool
// RevocableZone supports the volcano.sh/revocable-zone annotation or label on a pod/podgroup.
// Only an empty value or "*" is supported in this version; specifying a revocable zone name will be supported in a future release.
// An empty value means the workload cannot use revocable nodes;
// "*" means the workload can use all revocable nodes during the nodes' active revocable time.
RevocableZone string
NumaInfo *TopologyInfo
PodVolumes *volumescheduling.PodVolumes
Pod *v1.Pod
}
func getJobID(pod *v1.Pod) JobID {
if gn, found := pod.Annotations[v1beta1.KubeGroupNameAnnotationKey]; found && len(gn) != 0 {
// Make sure Pod and PodGroup belong to the same namespace.
jobID := fmt.Sprintf("%s/%s", pod.Namespace, gn)
return JobID(jobID)
}
return ""
}
func getTaskID(pod *v1.Pod) TaskID {
if ts, found := pod.Annotations[batch.TaskSpecKey]; found && len(ts) != 0 {
return TaskID(ts)
}
return ""
}
const TaskPriorityAnnotation = "volcano.sh/task-priority"
// NewTaskInfo creates new taskInfo object for a Pod
func NewTaskInfo(pod *v1.Pod) *TaskInfo {
initResReq := GetPodResourceRequest(pod)
resReq := initResReq
bestEffort := initResReq.IsEmpty()
preemptable := GetPodPreemptable(pod)
revocableZone := GetPodRevocableZone(pod)
topologyInfo := GetPodTopologyInfo(pod)
jobID := getJobID(pod)
ti := &TaskInfo{
UID: TaskID(pod.UID),
Job: jobID,
Name: pod.Name,
Namespace: pod.Namespace,
Priority: 1,
Pod: pod,
Resreq: resReq,
InitResreq: initResReq,
Preemptable: preemptable,
BestEffort: bestEffort,
RevocableZone: revocableZone,
NumaInfo: topologyInfo,
TransactionContext: TransactionContext{
NodeName: pod.Spec.NodeName,
Status: getTaskStatus(pod),
},
}
if pod.Spec.Priority != nil {
ti.Priority = *pod.Spec.Priority
}
if taskPriority, ok := pod.Annotations[TaskPriorityAnnotation]; ok {
if priority, err := strconv.ParseInt(taskPriority, 10, 32); err == nil {
ti.Priority = int32(priority)
}
}
return ti
}
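// taskPriorityOf is an illustrative sketch (not part of the original source) isolating the
// priority resolution performed by NewTaskInfo above: pod.Spec.Priority is the default, and the
// volcano.sh/task-priority annotation, when parseable, overrides it. The helper name is
// hypothetical.
func taskPriorityOf(pod *v1.Pod) int32 {
	priority := int32(1)
	if pod.Spec.Priority != nil {
		priority = *pod.Spec.Priority
	}
	if v, ok := pod.Annotations[TaskPriorityAnnotation]; ok {
		if p, err := strconv.ParseInt(v, 10, 32); err == nil {
			priority = int32(p)
		}
	}
	return priority
}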
// GetTransactionContext gets the transaction context of a task
func (ti *TaskInfo) GetTransactionContext() TransactionContext {
return ti.TransactionContext
}
// GenerateLastTxContext generates and sets the context of the last transaction for a task
func (ti *TaskInfo) GenerateLastTxContext() {
ctx := ti.GetTransactionContext()
ti.LastTransaction = &ctx
}
// ClearLastTxContext clears the context of the last transaction for a task
func (ti *TaskInfo) ClearLastTxContext() {
ti.LastTransaction = nil
}
func (ti *TaskInfo) SetPodResourceDecision() error {
if ti.NumaInfo == nil || len(ti.NumaInfo.ResMap) == 0 {
return nil
}
klog.V(4).Infof("%v/%v resource decision: %v", ti.Namespace, ti.Name, ti.NumaInfo.ResMap)
decision := PodResourceDecision{
NUMAResources: ti.NumaInfo.ResMap,
}
layout, err := json.Marshal(&decision)
if err != nil {
return err
}
metav1.SetMetaDataAnnotation(&ti.Pod.ObjectMeta, topologyDecisionAnnotation, string(layout[:]))
return nil
}
func (ti *TaskInfo) UnsetPodResourceDecision() {
delete(ti.Pod.Annotations, topologyDecisionAnnotation)
}
// Clone is used for cloning a task
func (ti *TaskInfo) Clone() *TaskInfo {
return &TaskInfo{
UID: ti.UID,
Job: ti.Job,
Name: ti.Name,
Namespace: ti.Namespace,
Priority: ti.Priority,
PodVolumes: ti.PodVolumes,
Pod: ti.Pod,
Resreq: ti.Resreq.Clone(),
InitResreq: ti.InitResreq.Clone(),
VolumeReady: ti.VolumeReady,
Preemptable: ti.Preemptable,
BestEffort: ti.BestEffort,
RevocableZone: ti.RevocableZone,
NumaInfo: ti.NumaInfo.Clone(),
TransactionContext: TransactionContext{
NodeName: ti.NodeName,
Status: ti.Status,
},
LastTransaction: ti.LastTransaction.Clone(),
}
}
func (ti *TaskInfo) GetTaskSpecKey() TaskID {
if ti.Pod == nil {
return ""
}
return getTaskID(ti.Pod)
}
// String returns the taskInfo details in a string
func (ti TaskInfo) String() string {
if ti.NumaInfo == nil {
return fmt.Sprintf("Task (%v:%v/%v): job %v, status %v, pri %v"+
"resreq %v, preemptable %v, revocableZone %v",
ti.UID, ti.Namespace, ti.Name, ti.Job, ti.Status, ti.Priority,
ti.Resreq, ti.Preemptable, ti.RevocableZone)
}
return fmt.Sprintf("Task (%v:%v/%v): job %v, status %v, pri %v"+
"resreq %v, preemptable %v, revocableZone %v, numaInfo %v",
ti.UID, ti.Namespace, ti.Name, ti.Job, ti.Status, ti.Priority,
ti.Resreq, ti.Preemptable, ti.RevocableZone, *ti.NumaInfo)
}
// JobID is the type of JobInfo's ID.
type JobID types.UID
type tasksMap map[TaskID]*TaskInfo
// NodeResourceMap stores resource in a node
type NodeResourceMap map[string]*Resource
// JobInfo will have all info of a Job
type JobInfo struct {
UID JobID
Name string
Namespace string
Queue QueueID
Priority int32
MinAvailable int32
WaitingTime *time.Duration
JobFitErrors string
NodesFitErrors map[TaskID]*FitErrors
// All tasks of the Job.
TaskStatusIndex map[TaskStatus]tasksMap
Tasks tasksMap
TaskMinAvailable map[TaskID]int32
TaskMinAvailableTotal int32
Allocated *Resource
TotalRequest *Resource
CreationTimestamp metav1.Time
PodGroup *PodGroup
ScheduleStartTimestamp metav1.Time
Preemptable bool
// RevocableZone supports the volcano.sh/revocable-zone annotation or label on a pod/podgroup.
// Only an empty value or "*" is supported in this version; specifying a revocable zone name will be supported in a future release.
// An empty value means the workload cannot use revocable nodes;
// "*" means the workload can use all revocable nodes during the nodes' active revocable time.
RevocableZone string
Budget *DisruptionBudget
}
// NewJobInfo creates a new jobInfo for a set of tasks
func NewJobInfo(uid JobID, tasks ...*TaskInfo) *JobInfo {
job := &JobInfo{
UID: uid,
MinAvailable: 0,
NodesFitErrors: make(map[TaskID]*FitErrors),
Allocated: EmptyResource(),
TotalRequest: EmptyResource(),
TaskStatusIndex: map[TaskStatus]tasksMap{},
Tasks: tasksMap{},
TaskMinAvailable: map[TaskID]int32{},
}
for _, task := range tasks {
job.AddTaskInfo(task)
}
return job
}
// UnsetPodGroup removes podGroup details from a job
func (ji *JobInfo) UnsetPodGroup() {
ji.PodGroup = nil
}
// SetPodGroup sets podGroup details to a job
func (ji *JobInfo) SetPodGroup(pg *PodGroup) {
ji.Name = pg.Name
ji.Namespace = pg.Namespace
ji.MinAvailable = pg.Spec.MinMember
ji.Queue = QueueID(pg.Spec.Queue)
ji.CreationTimestamp = pg.GetCreationTimestamp()
var err error
ji.WaitingTime, err = ji.extractWaitingTime(pg)
if err != nil {
klog.Warningf("Error occurs in parsing waiting time for job <%s/%s>, err: %s.",
pg.Namespace, pg.Name, err.Error())
ji.WaitingTime = nil
}
ji.Preemptable = ji.extractPreemptable(pg)
ji.RevocableZone = ji.extractRevocableZone(pg)
ji.Budget = ji.extractBudget(pg)
taskMinAvailableTotal := int32(0)
for task, member := range pg.Spec.MinTaskMember {
ji.TaskMinAvailable[TaskID(task)] = member
taskMinAvailableTotal += member
}
ji.TaskMinAvailableTotal = taskMinAvailableTotal
ji.PodGroup = pg
}
// extractWaitingTime reads the SLA waiting time for a job from the podgroup annotations
// TODO: should also read from given field in volcano job spec
func (ji *JobInfo) extractWaitingTime(pg *PodGroup) (*time.Duration, error) {
if _, exist := pg.Annotations[JobWaitingTime]; !exist {
return nil, nil
}
jobWaitingTime, err := time.ParseDuration(pg.Annotations[JobWaitingTime])
if err != nil {
return nil, err
}
if jobWaitingTime <= 0 {
return nil, errors.New("invalid sla waiting time")
}
return &jobWaitingTime, nil
}
// extractPreemptable returns the volcano.sh/preemptable value for the job
func (ji *JobInfo) extractPreemptable(pg *PodGroup) bool {
// check the annotation first
if len(pg.Annotations) > 0 {
if value, found := pg.Annotations[v1beta1.PodPreemptable]; found {
b, err := strconv.ParseBool(value)
if err != nil {
klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
return false
}
return b
}
}
// if the annotation does not exist, check the label
if len(pg.Labels) > 0 {
if value, found := pg.Labels[v1beta1.PodPreemptable]; found {
b, err := strconv.ParseBool(value)
if err != nil {
klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
return false
}
return b
}
}
return false
}
// extractRevocableZone returns the volcano.sh/revocable-zone value for the pod/podgroup
func (ji *JobInfo) extractRevocableZone(pg *PodGroup) string {
// check annotation first
if len(pg.Annotations) > 0 {
if value, found := pg.Annotations[v1beta1.RevocableZone]; found {
if value != "*" {
return ""
}
return value
}
if value, found := pg.Annotations[v1beta1.PodPreemptable]; found {
if b, err := strconv.ParseBool(value); err == nil && b {
return "*"
}
}
}
return ""
}
// extractBudget returns the budget value for the job
func (ji *JobInfo) extractBudget(pg *PodGroup) *DisruptionBudget {
if len(pg.Annotations) > 0 {
if value, found := pg.Annotations[v1beta1.JDBMinAvailable]; found {
return NewDisruptionBudget(value, "")
} else if value, found := pg.Annotations[v1beta1.JDBMaxUnavailable]; found {
return NewDisruptionBudget("", value)
}
}
return NewDisruptionBudget("", "")
}
// GetMinResources returns the min resources of the podgroup.
func (ji *JobInfo) GetMinResources() *Resource {
if ji.PodGroup.Spec.MinResources == nil {
return EmptyResource()
}
return NewResource(*ji.PodGroup.Spec.MinResources)
}
func (ji *JobInfo) addTaskIndex(ti *TaskInfo) {
if _, found := ji.TaskStatusIndex[ti.Status]; !found {
ji.TaskStatusIndex[ti.Status] = tasksMap{}
}
ji.TaskStatusIndex[ti.Status][ti.UID] = ti
}
// AddTaskInfo is used to add a task to a job
func (ji *JobInfo) AddTaskInfo(ti *TaskInfo) {
ji.Tasks[ti.UID] = ti
ji.addTaskIndex(ti)
ji.TotalRequest.Add(ti.Resreq)
if AllocatedStatus(ti.Status) {
ji.Allocated.Add(ti.Resreq)
}
}
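// jobInfoFromPods is an illustrative sketch (not part of the original source) tying together
// NewTaskInfo, NewJobInfo and AddTaskInfo above: each pod becomes a TaskInfo and is indexed into
// the job by UID and status. The helper name is hypothetical.
func jobInfoFromPods(uid JobID, pods ...*v1.Pod) *JobInfo {
	job := NewJobInfo(uid)
	for _, pod := range pods {
		job.AddTaskInfo(NewTaskInfo(pod))
	}
	return job
}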
// UpdateTaskStatus is used to update a task's status in a job.
// If an error occurs, both the task and the job are guaranteed to remain in their original state.
func (ji *JobInfo) UpdateTaskStatus(task *TaskInfo, status TaskStatus) error {
if err := validateStatusUpdate(task.Status, status); err != nil {
return err
}
// First remove the task (if exist) from the task list.
if _, found := ji.Tasks[task.UID]; found {
if err := ji.DeleteTaskInfo(task); err != nil {
return err
}
}
// Update task's status to the target status once task addition is guaranteed to succeed.
task.Status = status
ji.AddTaskInfo(task)
return nil
}
func (ji *JobInfo) deleteTaskIndex(ti *TaskInfo) {
if tasks, found := ji.TaskStatusIndex[ti.Status]; found {
delete(tasks, ti.UID)
if len(tasks) == 0 {
delete(ji.TaskStatusIndex, ti.Status)
}
}
}
// DeleteTaskInfo is used to delete a task from a job
func (ji *JobInfo) DeleteTaskInfo(ti *TaskInfo) error {
if task, found := ji.Tasks[ti.UID]; found {
ji.TotalRequest.Sub(task.Resreq)
if AllocatedStatus(task.Status) {
ji.Allocated.Sub(task.Resreq)
}
delete(ji.Tasks, task.UID)
ji.deleteTaskIndex(task)
return nil
}
return fmt.Errorf("failed to find task <%v/%v> in job <%v/%v>",
ti.Namespace, ti.Name, ji.Namespace, ji.Name)
}
// Clone is used to clone a jobInfo object
func (ji *JobInfo) Clone() *JobInfo {
info := &JobInfo{
UID: ji.UID,
Name: ji.Name,
Namespace: ji.Namespace,
Queue: ji.Queue,
Priority: ji.Priority,
MinAvailable: ji.MinAvailable,
WaitingTime: ji.WaitingTime,
JobFitErrors: ji.JobFitErrors,
NodesFitErrors: make(map[TaskID]*FitErrors),
Allocated: EmptyResource(),
TotalRequest: EmptyResource(),
PodGroup: ji.PodGroup.Clone(),
TaskStatusIndex: map[TaskStatus]tasksMap{},
TaskMinAvailable: ji.TaskMinAvailable,
TaskMinAvailableTotal: ji.TaskMinAvailableTotal,
Tasks: tasksMap{},
Preemptable: ji.Preemptable,
RevocableZone: ji.RevocableZone,
Budget: ji.Budget.Clone(),
}
ji.CreationTimestamp.DeepCopyInto(&info.CreationTimestamp)
for _, task := range ji.Tasks {
info.AddTaskInfo(task.Clone())
}
return info
}
// String returns a jobInfo object in string format
func (ji JobInfo) String() string {
res := ""
i := 0
for _, task := range ji.Tasks {
res += fmt.Sprintf("\n\t %d: %v", i, task)
i++
}
return fmt.Sprintf("Job (%v): namespace %v (%v), name %v, minAvailable %d, podGroup %+v, preemptable %+v, revocableZone %+v, minAvailable %+v, maxAvailable %+v",
ji.UID, ji.Namespace, ji.Queue, ji.Name, ji.MinAvailable, ji.PodGroup, ji.Preemptable, ji.RevocableZone, ji.Budget.MinAvailable, ji.Budget.MaxUnavilable) + res
}
// FitError returns detailed information on why a job's task failed to fit on
// each available node
func (ji *JobInfo) FitError() string {
sortReasonsHistogram := func(reasons map[string]int) []string {
reasonStrings := []string{}
for k, v := range reasons {
reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
}
sort.Strings(reasonStrings)
return reasonStrings
}
// Stat histogram for all tasks of the job
reasons := make(map[string]int)
for status, taskMap := range ji.TaskStatusIndex {
reasons[status.String()] += len(taskMap)
}
reasons["minAvailable"] = int(ji.MinAvailable)
reasonMsg := fmt.Sprintf("%v, %v", scheduling.PodGroupNotReady, strings.Join(sortReasonsHistogram(reasons), ", "))
// Stat histogram for pending tasks only
reasons = make(map[string]int)
for uid := range ji.TaskStatusIndex[Pending] {
reason, _ := ji.TaskSchedulingReason(uid)
reasons[reason]++
}
if len(reasons) > 0 {
reasonMsg += "; " + fmt.Sprintf("%s: %s", Pending.String(), strings.Join(sortReasonsHistogram(reasons), ", "))
}
return reasonMsg
}
// TaskSchedulingReason gets the detailed reason and message for the given task,
// based on its last scheduling transaction.
func (ji *JobInfo) TaskSchedulingReason(tid TaskID) (reason string, msg string) {
taskInfo, exists := ji.Tasks[tid]
if !exists {
return "", ""
}
// Get detailed scheduling reason based on LastTransaction
ctx := taskInfo.GetTransactionContext()
if taskInfo.LastTransaction != nil {
ctx = *taskInfo.LastTransaction
}
msg = ji.JobFitErrors
switch status := ctx.Status; status {
case Allocated, Pipelined:
// Pod is schedulable
msg = fmt.Sprintf("Pod %s/%s can possibly be assigned to %s", taskInfo.Namespace, taskInfo.Name, ctx.NodeName)
if status == Pipelined {
msg += " once resource is released"
}
return PodReasonSchedulable, msg
case Pending:
if fe := ji.NodesFitErrors[tid]; fe != nil {
// Pod is not schedulable
return PodReasonUnschedulable, fe.Error()
}
// Pod is not scheduled yet
return PodReasonUndetermined, msg
default:
return status.String(), msg
}
}
// ReadyTaskNum returns the number of tasks that are ready or best-effort.
func (ji *JobInfo) ReadyTaskNum() int32 {
occupied := 0
occupied += len(ji.TaskStatusIndex[Bound])
occupied += len(ji.TaskStatusIndex[Binding])
occupied += len(ji.TaskStatusIndex[Running])
occupied += len(ji.TaskStatusIndex[Allocated])
occupied += len(ji.TaskStatusIndex[Succeeded])
if tasks, found := ji.TaskStatusIndex[Pending]; found {
for _, task := range tasks {
if task.BestEffort {
occupied++
}
}
}
return int32(occupied)
}
// WaitingTaskNum returns the number of tasks that are pipelined.
func (ji *JobInfo) WaitingTaskNum() int32 {
return int32(len(ji.TaskStatusIndex[Pipelined]))
}
// CheckTaskMinAvailable returns whether each task of job is valid.
func (ji *JobInfo) CheckTaskMinAvailable() bool {
// if the job's minAvailable is less than the sum of the tasks' minAvailable, skip this check
if ji.MinAvailable < ji.TaskMinAvailableTotal {
return true
}
actual := map[TaskID]int32{}
for status, tasks := range ji.TaskStatusIndex {
if AllocatedStatus(status) ||
status == Succeeded ||
status == Pipelined ||
status == Pending {
for _, task := range tasks {
actual[getTaskID(task.Pod)]++
}
}
}
klog.V(4).Infof("job %s/%s actual: %+v, ji.TaskMinAvailable: %+v", ji.Name, ji.Namespace, actual, ji.TaskMinAvailable)
for task, minAvailable := range ji.TaskMinAvailable {
if act, ok := actual[task]; !ok || act < minAvailable {
return false
}
}
return true
}
// CheckTaskMinAvailableReady returns whether ready pods meet each task's minAvailable.
func (ji *JobInfo) CheckTaskMinAvailableReady() bool {
if ji.MinAvailable < ji.TaskMinAvailableTotal {
return true
}
occupiedMap := map[TaskID]int32{}
for status, tasks := range ji.TaskStatusIndex {
if AllocatedStatus(status) ||
status == Succeeded {
for _, task := range tasks {
occupiedMap[getTaskID(task.Pod)] += 1
}
continue
}
if status == Pending {
for _, task := range tasks {
if task.InitResreq.IsEmpty() {
occupiedMap[getTaskID(task.Pod)] += 1
}
}
}
}
for taskId, minNum := range ji.TaskMinAvailable {
if occupiedMap[taskId] < minNum {
klog.V(4).Infof("Job %s/%s Task %s occupied %v less than task min avaliable", ji.Namespace, ji.Name, taskId, occupiedMap[taskId])
return false
}
}
return true
}
// CheckTaskMinAvailablePipelined returns whether ready or pipelined pods meet each task's minAvailable.
func (ji *JobInfo) CheckTaskMinAvailablePipelined() bool {
if ji.MinAvailable < ji.TaskMinAvailableTotal {
return true
}
occupiedMap := map[TaskID]int32{}
for status, tasks := range ji.TaskStatusIndex {
if AllocatedStatus(status) ||
status == Succeeded ||
status == Pipelined {
for _, task := range tasks {
occupiedMap[getTaskID(task.Pod)] += 1
}
continue
}
if status == Pending {
for _, task := range tasks {
if task.InitResreq.IsEmpty() {
occupiedMap[getTaskID(task.Pod)] += 1
}
}
}
}
for taskId, minNum := range ji.TaskMinAvailable {
if occupiedMap[taskId] < minNum {
klog.V(4).Infof("Job %s/%s Task %s occupied %v less than task min avaliable", ji.Namespace, ji.Name, taskId, occupiedMap[taskId])
return false
}
}
return true
}
// ValidTaskNum returns the number of tasks that are valid.
func (ji *JobInfo) ValidTaskNum() int32 {
occupied := 0
for status, tasks := range ji.TaskStatusIndex {
if AllocatedStatus(status) ||
status == Succeeded ||
status == Pipelined ||
status == Pending {
occupied += len(tasks)
}
}
return int32(occupied)
}
// Ready returns whether the job is ready to run
func (ji *JobInfo) Ready() bool {
occupied := ji.ReadyTaskNum()
return occupied >= ji.MinAvailable
}
// IsPending returns whether the job is in pending status
func (ji *JobInfo) IsPending() bool {
return ji.PodGroup == nil || ji.PodGroup.Status.Phase == scheduling.PodGroupPending || ji.PodGroup.Status.Phase == ""
}
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog"
)
// NamespaceName is name of namespace
type NamespaceName string
const (
// NamespaceWeightKey is the key in ResourceQuota.spec.hard indicating the weight of this namespace
NamespaceWeightKey = "volcano.sh/namespace.weight"
// DefaultNamespaceWeight is the default weight of namespace
DefaultNamespaceWeight = 1
)
// NamespaceInfo records information of namespace
type NamespaceInfo struct {
// Name is the name of this namespace
Name NamespaceName
// Weight is the highest weight among many ResourceQuota.
Weight int64
}
// GetWeight returns the weight of a namespace; any invalid case falls back to the default value
func (n *NamespaceInfo) GetWeight() int64 {
if n == nil || n.Weight == 0 {
return DefaultNamespaceWeight
}
return n.Weight
}
type quotaItem struct {
name string
weight int64
}
func quotaItemKeyFunc(obj interface{}) (string, error) {
item, ok := obj.(*quotaItem)
if !ok {
return "", fmt.Errorf("obj with type %T could not parse", obj)
}
return item.name, nil
}
// quotaItemLessFunc orders items by descending weight, so the heap root holds the largest weight (a max-heap)
func quotaItemLessFunc(a interface{}, b interface{}) bool {
A := a.(*quotaItem)
B := b.(*quotaItem)
return A.weight > B.weight
}
// NamespaceCollection will record all details about namespace
type NamespaceCollection struct {
Name string
quotaWeight *cache.Heap
}
// NewNamespaceCollection creates new NamespaceCollection object to record all information about a namespace
func NewNamespaceCollection(name string) *NamespaceCollection {
n := &NamespaceCollection{
Name: name,
quotaWeight: cache.NewHeap(quotaItemKeyFunc, quotaItemLessFunc),
}
return n
}
func (n *NamespaceCollection) deleteWeight(q *quotaItem) {
n.quotaWeight.Delete(q)
}
func (n *NamespaceCollection) updateWeight(q *quotaItem) {
n.quotaWeight.Update(q)
}
func itemFromQuota(quota *v1.ResourceQuota) *quotaItem {
var weight int64 = DefaultNamespaceWeight
quotaWeight, ok := quota.Spec.Hard[NamespaceWeightKey]
if ok {
weight = quotaWeight.Value()
}
item := "aItem{
name: quota.Name,
weight: weight,
}
return item
}
// Update modifies the registered information according to the quota object
func (n *NamespaceCollection) Update(quota *v1.ResourceQuota) {
n.updateWeight(itemFromQuota(quota))
}
// Delete removes the registered information according to the quota object
func (n *NamespaceCollection) Delete(quota *v1.ResourceQuota) {
n.deleteWeight(itemFromQuota(quota))
}
// Snapshot clones a NamespaceInfo (without the heap) from the NamespaceCollection
func (n *NamespaceCollection) Snapshot() *NamespaceInfo {
var weight int64 = DefaultNamespaceWeight
obj, err := n.quotaWeight.Pop()
if err != nil {
klog.Warningf("namespace %s, quota weight meets error %v when pop", n.Name, err)
} else {
item := obj.(*quotaItem)
weight = item.weight
n.quotaWeight.Add(item)
}
return &NamespaceInfo{
Name: NamespaceName(n.Name),
Weight: weight,
}
}
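// Usage sketch (assumed flow, not from the original source): the collection is
// fed from ResourceQuota events and snapshotted when building the scheduler cache.
//
//	nc := NewNamespaceCollection("team-a")
//	nc.Update(quota)           // quota carries volcano.sh/namespace.weight in spec.hard
//	info := nc.Snapshot()      // peeks the largest weight by popping and re-adding it
//	weight := info.GetWeight() // falls back to DefaultNamespaceWeight when unset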
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"fmt"
"strconv"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
type AllocateFailError struct {
Reason string
}
func (o *AllocateFailError) Error() string {
return o.Reason
}
// NodeInfo is node level aggregated information.
type NodeInfo struct {
Name string
Node *v1.Node
// The state of node
State NodeState
// The releasing resource on that node
Releasing *Resource
// The pipelined resource on that node
Pipelined *Resource
// The idle resource on that node
Idle *Resource
// The used resource on that node, including running and terminating
// pods
Used *Resource
Allocatable *Resource
Capability *Resource
Tasks map[TaskID]*TaskInfo
NumaInfo *NumatopoInfo
NumaChgFlag NumaChgFlag
NumaSchedulerInfo *NumatopoInfo
RevocableZone string
// Used to store custom information
Others map[string]interface{}
GPUDevices map[int]*GPUDevice
// enable node resource oversubscription
OversubscriptionNode bool
// OfflineJobEvicting being true means the node's resource usage is too high, so newly dispatched pods cannot use oversubscription resources
OfflineJobEvicting bool
// Resource Oversubscription feature: the Oversubscription Resource reported in annotation
OversubscriptionResource *Resource
}
// FutureIdle returns resources that will be idle in the future:
//
// That is current idle resources plus released resources minus pipelined resources.
func (ni *NodeInfo) FutureIdle() *Resource {
return ni.Idle.Clone().Add(ni.Releasing).Sub(ni.Pipelined)
}
// GetNodeAllocatable returns the node's Allocatable without the oversubscription resource
func (ni *NodeInfo) GetNodeAllocatable() *Resource {
return NewResource(ni.Node.Status.Allocatable)
}
// NodeState defines the current state of node.
type NodeState struct {
Phase NodePhase
Reason string
}
// NewNodeInfo is used to create new nodeInfo object
func NewNodeInfo(node *v1.Node) *NodeInfo {
nodeInfo := &NodeInfo{
Releasing: EmptyResource(),
Pipelined: EmptyResource(),
Idle: EmptyResource(),
Used: EmptyResource(),
Allocatable: EmptyResource(),
Capability: EmptyResource(),
OversubscriptionResource: EmptyResource(),
Tasks: make(map[TaskID]*TaskInfo),
GPUDevices: make(map[int]*GPUDevice),
}
nodeInfo.setOversubscription(node)
if node != nil {
nodeInfo.Name = node.Name
nodeInfo.Node = node
nodeInfo.Idle = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource)
nodeInfo.Allocatable = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource)
nodeInfo.Capability = NewResource(node.Status.Capacity).Add(nodeInfo.OversubscriptionResource)
}
nodeInfo.setNodeGPUInfo(node)
nodeInfo.setNodeState(node)
nodeInfo.setRevocableZone(node)
return nodeInfo
}
// RefreshNumaSchedulerInfoByCrd updates the scheduler's NUMA information based on the Numatopology CRD
func (ni *NodeInfo) RefreshNumaSchedulerInfoByCrd() {
if ni.NumaInfo == nil {
ni.NumaSchedulerInfo = nil
return
}
tmp := ni.NumaInfo.DeepCopy()
if ni.NumaChgFlag == NumaInfoMoreFlag {
ni.NumaSchedulerInfo = tmp
} else if ni.NumaChgFlag == NumaInfoLessFlag {
numaResMap := ni.NumaSchedulerInfo.NumaResMap
for resName, resInfo := range tmp.NumaResMap {
klog.V(5).Infof("resource %s Allocatable : current %v new %v on node %s",
resName, numaResMap[resName], resInfo, ni.Name)
if numaResMap[resName].Allocatable.Size() >= resInfo.Allocatable.Size() {
numaResMap[resName].Allocatable = resInfo.Allocatable.Clone()
numaResMap[resName].Capacity = resInfo.Capacity
}
}
}
ni.NumaChgFlag = NumaInfoResetFlag
}
// Clone used to clone nodeInfo Object
func (ni *NodeInfo) Clone() *NodeInfo {
res := NewNodeInfo(ni.Node)
for _, p := range ni.Tasks {
res.AddTask(p)
}
if ni.NumaInfo != nil {
res.NumaInfo = ni.NumaInfo.DeepCopy()
}
if ni.NumaSchedulerInfo != nil {
res.NumaSchedulerInfo = ni.NumaSchedulerInfo.DeepCopy()
klog.V(5).Infof("node[%s]", ni.Name)
for resName, resInfo := range res.NumaSchedulerInfo.NumaResMap {
klog.V(5).Infof("current resource %s : %v", resName, resInfo)
}
klog.V(5).Infof("current Policies : %v", res.NumaSchedulerInfo.Policies)
}
res.Others = ni.Others
return res
}
// Ready returns whether node is ready for scheduling
func (ni *NodeInfo) Ready() bool {
return ni.State.Phase == Ready
}
func (ni *NodeInfo) setRevocableZone(node *v1.Node) {
if node == nil {
klog.Warningf("the argument node is null.")
return
}
revocableZone := ""
if len(node.Labels) > 0 {
if value, found := node.Labels[v1beta1.RevocableZone]; found {
revocableZone = value
}
}
ni.RevocableZone = revocableZone
}
// setOversubscription checks whether the node enables oversubscription and sets the oversubscription resources.
// Only CPU and memory oversubscription are supported in this version.
func (ni *NodeInfo) setOversubscription(node *v1.Node) {
if node == nil {
return
}
ni.OversubscriptionNode = false
ni.OfflineJobEvicting = false
if len(node.Labels) > 0 {
if value, found := node.Labels[OversubscriptionNode]; found {
b, err := strconv.ParseBool(value)
if err == nil {
ni.OversubscriptionNode = b
} else {
ni.OversubscriptionNode = false
}
klog.V(5).Infof("Set node %s Oversubscription to %v", node.Name, ni.OversubscriptionNode)
}
}
if len(node.Annotations) > 0 {
if value, found := node.Annotations[OfflineJobEvicting]; found {
b, err := strconv.ParseBool(value)
if err == nil {
ni.OfflineJobEvicting = b
} else {
ni.OfflineJobEvicting = false
}
klog.V(5).Infof("Set node %s OfflineJobEvicting to %v", node.Name, ni.OfflineJobEvicting)
}
if value, found := node.Annotations[OversubscriptionCPU]; found {
ni.OversubscriptionResource.MilliCPU, _ = strconv.ParseFloat(value, 64)
klog.V(5).Infof("Set node %s Oversubscription CPU to %v", node.Name, ni.OversubscriptionResource.MilliCPU)
}
if value, found := node.Annotations[OversubscriptionMemory]; found {
ni.OversubscriptionResource.Memory, _ = strconv.ParseFloat(value, 64)
klog.V(5).Infof("Set node %s Oversubscription Memory to %v", node.Name, ni.OversubscriptionResource.Memory)
}
}
}
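// Illustrative node setup (a sketch; the concrete label/annotation key strings
// are the package constants referenced above and are defined elsewhere):
//
//	node := &v1.Node{}
//	node.Labels = map[string]string{OversubscriptionNode: "true"}
//	node.Annotations = map[string]string{
//		OversubscriptionCPU:    "2000",       // extra milli-CPU reported by the node agent
//		OversubscriptionMemory: "1073741824", // extra bytes of memory
//	}
//	ni := NewNodeInfo(node)
//	// ni.Idle, ni.Allocatable and ni.Capability now include the oversubscribed amounts.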
func (ni *NodeInfo) setNodeState(node *v1.Node) {
// If node is nil, the node is un-initialized in cache
if node == nil {
ni.State = NodeState{
Phase: NotReady,
Reason: "UnInitialized",
}
return
}
// set NodeState according to resources
if !ni.Used.LessEqual(ni.Allocatable, Zero) {
ni.State = NodeState{
Phase: NotReady,
Reason: "OutOfSync",
}
return
}
// If the node is not ready, e.g. powered off
for _, cond := range node.Status.Conditions {
if cond.Type == v1.NodeReady && cond.Status != v1.ConditionTrue {
ni.State = NodeState{
Phase: NotReady,
Reason: "NotReady",
}
klog.Warningf("set the node %s status to %s.", node.Name, NotReady.String())
return
}
}
// Node is ready (ignore node conditions because of taint/toleration)
ni.State = NodeState{
Phase: Ready,
Reason: "",
}
klog.V(4).Infof("set the node %s status to %s.", node.Name, Ready.String())
}
func (ni *NodeInfo) setNodeGPUInfo(node *v1.Node) {
if node == nil {
return
}
memory, ok := node.Status.Capacity[VolcanoGPUResource]
if !ok {
return
}
totalMemory := memory.Value()
res, ok := node.Status.Capacity[VolcanoGPUNumber]
if !ok {
return
}
gpuNumber := res.Value()
if gpuNumber == 0 {
klog.Warningf("invalid %s=%s", VolcanoGPUNumber, res.String())
return
}
memoryPerCard := uint(totalMemory / gpuNumber)
for i := 0; i < int(gpuNumber); i++ {
ni.GPUDevices[i] = NewGPUDevice(i, memoryPerCard)
}
}
// SetNode sets the kubernetes node object in the nodeInfo object
func (ni *NodeInfo) SetNode(node *v1.Node) {
ni.setNodeState(node)
if !ni.Ready() {
klog.Warningf("Failed to set node info for %s, phase: %s, reason: %s",
ni.Name, ni.State.Phase, ni.State.Reason)
return
}
// Dry run, make sure all fields other than `State` are in the original state.
copy := ni.Clone()
copy.setNode(node)
copy.setNodeState(node)
if !copy.Ready() {
klog.Warningf("SetNode makes node %s not ready, phase: %s, reason: %s",
copy.Name, copy.State.Phase, copy.State.Reason)
// Set the node state to not-ready and leave other fields untouched
ni.State = copy.State
return
}
ni.setNode(node)
}
// setNode sets the kubernetes node object in the nodeInfo object without assertion
func (ni *NodeInfo) setNode(node *v1.Node) {
ni.setOversubscription(node)
ni.setNodeGPUInfo(node)
ni.setRevocableZone(node)
ni.Name = node.Name
ni.Node = node
ni.Allocatable = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource)
ni.Capability = NewResource(node.Status.Capacity).Add(ni.OversubscriptionResource)
ni.Releasing = EmptyResource()
ni.Pipelined = EmptyResource()
ni.Idle = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource)
ni.Used = EmptyResource()
for _, ti := range ni.Tasks {
switch ti.Status {
case Releasing:
ni.Idle.sub(ti.Resreq) // sub without assertion
ni.Releasing.Add(ti.Resreq)
ni.Used.Add(ti.Resreq)
ni.AddGPUResource(ti.Pod)
case Pipelined:
ni.Pipelined.Add(ti.Resreq)
default:
ni.Idle.sub(ti.Resreq) // sub without assertion
ni.Used.Add(ti.Resreq)
ni.AddGPUResource(ti.Pod)
}
}
}
func (ni *NodeInfo) allocateIdleResource(ti *TaskInfo) error {
if ti.Resreq.LessEqual(ni.Idle, Zero) {
ni.Idle.Sub(ti.Resreq)
return nil
}
return &AllocateFailError{Reason: fmt.Sprintf(
"cannot allocate resource, <%s> idle: %s <%s/%s> req: %s",
ni.Name, ni.Idle.String(), ti.Namespace, ti.Name, ti.Resreq.String(),
)}
}
// AddTask is used to add a task to the nodeInfo object
//
// If error occurs both task and node are guaranteed to be in the original state.
func (ni *NodeInfo) AddTask(task *TaskInfo) error {
if len(task.NodeName) > 0 && len(ni.Name) > 0 && task.NodeName != ni.Name {
return fmt.Errorf("task <%v/%v> already on different node <%v>",
task.Namespace, task.Name, task.NodeName)
}
key := PodKey(task.Pod)
if _, found := ni.Tasks[key]; found {
return fmt.Errorf("task <%v/%v> already on node <%v>",
task.Namespace, task.Name, ni.Name)
}
// Node will hold a copy of task to make sure the status
// change will not impact resource in node.
ti := task.Clone()
if ni.Node != nil {
switch ti.Status {
case Releasing:
if err := ni.allocateIdleResource(ti); err != nil {
return err
}
ni.Releasing.Add(ti.Resreq)
ni.Used.Add(ti.Resreq)
ni.AddGPUResource(ti.Pod)
case Pipelined:
ni.Pipelined.Add(ti.Resreq)
default:
if err := ni.allocateIdleResource(ti); err != nil {
return err
}
ni.Used.Add(ti.Resreq)
ni.AddGPUResource(ti.Pod)
}
}
if ni.NumaInfo != nil {
ni.NumaInfo.AddTask(ti)
}
// Update task node name upon successful task addition.
task.NodeName = ni.Name
ti.NodeName = ni.Name
ni.Tasks[key] = ti
return nil
}
// RemoveTask removes a task from the nodeInfo object.
//
// If error occurs both task and node are guaranteed to be in the original state.
func (ni *NodeInfo) RemoveTask(ti *TaskInfo) error {
key := PodKey(ti.Pod)
task, found := ni.Tasks[key]
if !found {
klog.Warningf("failed to find task <%v/%v> on host <%v>",
ti.Namespace, ti.Name, ni.Name)
return nil
}
if ni.Node != nil {
switch task.Status {
case Releasing:
ni.Releasing.Sub(task.Resreq)
ni.Idle.Add(task.Resreq)
ni.Used.Sub(task.Resreq)
ni.SubGPUResource(ti.Pod)
case Pipelined:
ni.Pipelined.Sub(task.Resreq)
default:
ni.Idle.Add(task.Resreq)
ni.Used.Sub(task.Resreq)
ni.SubGPUResource(ti.Pod)
}
}
if ni.NumaInfo != nil {
ni.NumaInfo.RemoveTask(ti)
}
delete(ni.Tasks, key)
return nil
}
// UpdateTask is used to update a task in nodeInfo object.
//
// If error occurs both task and node are guaranteed to be in the original state.
func (ni *NodeInfo) UpdateTask(ti *TaskInfo) error {
if err := ni.RemoveTask(ti); err != nil {
return err
}
if err := ni.AddTask(ti); err != nil {
// This should never happen if task removal was successful,
// because the only possible error during task addition is when the task is still on a node.
klog.Fatalf("Failed to add Task <%s,%s> to Node <%s> during task update",
ti.Namespace, ti.Name, ni.Name)
}
return nil
}
// String returns nodeInfo details in string format
func (ni NodeInfo) String() string {
tasks := ""
i := 0
for _, task := range ni.Tasks {
tasks += fmt.Sprintf("\n\t %d: %v", i, task)
i++
}
return fmt.Sprintf("Node (%s): allocatable<%v> idle <%v>, used <%v>, releasing <%v>, oversubscribution <%v>, "+
"state <phase %s, reaseon %s>, oversubscributionNode <%v>, offlineJobEvicting <%v>,taints <%v>%s",
ni.Name, ni.Allocatable, ni.Idle, ni.Used, ni.Releasing, ni.OversubscriptionResource, ni.State.Phase, ni.State.Reason, ni.OversubscriptionNode, ni.OfflineJobEvicting, ni.Node.Spec.Taints, tasks)
}
// Pods returns all pods running on that node
func (ni *NodeInfo) Pods() (pods []*v1.Pod) {
for _, t := range ni.Tasks {
pods = append(pods, t.Pod)
}
return
}
// GetDevicesIdleGPUMemory returns all the idle GPU memory by gpu card.
func (ni *NodeInfo) GetDevicesIdleGPUMemory() map[int]uint {
devicesAllGPUMemory := ni.getDevicesAllGPUMemory()
devicesUsedGPUMemory := ni.getDevicesUsedGPUMemory()
res := map[int]uint{}
for id, allMemory := range devicesAllGPUMemory {
if usedMemory, found := devicesUsedGPUMemory[id]; found {
res[id] = allMemory - usedMemory
} else {
res[id] = allMemory
}
}
return res
}
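// Sketch (hypothetical sizes): a node exposing 2 GPU cards with 16000 units of
// total GPU memory yields two GPUDevice entries of 8000 each; subtracting the
// memory requested by pods recorded in each device's PodMap gives the idle map.
//
//	idle := ni.GetDevicesIdleGPUMemory() // e.g. map[int]uint{0: 8000, 1: 4000}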
func (ni *NodeInfo) getDevicesUsedGPUMemory() map[int]uint {
res := map[int]uint{}
for _, device := range ni.GPUDevices {
res[device.ID] = device.getUsedGPUMemory()
}
return res
}
func (ni *NodeInfo) getDevicesAllGPUMemory() map[int]uint {
res := map[int]uint{}
for _, device := range ni.GPUDevices {
res[device.ID] = device.Memory
}
return res
}
// AddGPUResource adds the pod to GPU pool if it is assigned
func (ni *NodeInfo) AddGPUResource(pod *v1.Pod) {
gpuRes := GetGPUResourceOfPod(pod)
if gpuRes > 0 {
id := GetGPUIndex(pod)
if dev := ni.GPUDevices[id]; dev != nil {
dev.PodMap[string(pod.UID)] = pod
}
}
}
// SubGPUResource frees the GPU held by the pod
func (ni *NodeInfo) SubGPUResource(pod *v1.Pod) {
gpuRes := GetGPUResourceOfPod(pod)
if gpuRes > 0 {
id := GetGPUIndex(pod)
if dev := ni.GPUDevices[id]; dev != nil {
delete(dev.PodMap, string(pod.UID))
}
}
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"encoding/json"
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
)
// NumaChgFlag indicates how the node's NUMA info has changed
type NumaChgFlag int
const (
// NumaInfoResetFlag indicates a reset operation
NumaInfoResetFlag NumaChgFlag = 0b00
// NumaInfoMoreFlag indicates the received allocatable resource is increasing
NumaInfoMoreFlag NumaChgFlag = 0b11
// NumaInfoLessFlag indicates the received allocatable resource is decreasing
NumaInfoLessFlag NumaChgFlag = 0b10
)
// PodResourceDecision is the resource allocation determined by the scheduler,
// and passed to kubelet through pod annotation.
type PodResourceDecision struct {
// NUMAResources is resource list with numa info indexed by numa id.
NUMAResources map[int]v1.ResourceList `json:"numa,omitempty"`
}
// ResourceInfo is the allocatable information for the resource
type ResourceInfo struct {
Allocatable cpuset.CPUSet
Capacity int
AllocatablePerNuma map[int]float64 // key: NUMA ID
UsedPerNuma map[int]float64 // key: NUMA ID
}
// NumatopoInfo is the information about topology manager on the node
type NumatopoInfo struct {
Namespace string
Name string
Policies map[nodeinfov1alpha1.PolicyName]string
NumaResMap map[string]*ResourceInfo
CPUDetail topology.CPUDetails
ResReserved v1.ResourceList
}
// DeepCopy returns a deep copy of the NumatopoInfo
func (info *NumatopoInfo) DeepCopy() *NumatopoInfo {
numaInfo := &NumatopoInfo{
Namespace: info.Namespace,
Name: info.Name,
Policies: make(map[nodeinfov1alpha1.PolicyName]string),
NumaResMap: make(map[string]*ResourceInfo),
CPUDetail: topology.CPUDetails{},
ResReserved: make(v1.ResourceList),
}
policies := info.Policies
for name, policy := range policies {
numaInfo.Policies[name] = policy
}
for resName, resInfo := range info.NumaResMap {
tmpInfo := &ResourceInfo{
AllocatablePerNuma: make(map[int]float64),
UsedPerNuma: make(map[int]float64),
}
tmpInfo.Capacity = resInfo.Capacity
tmpInfo.Allocatable = resInfo.Allocatable.Clone()
for numaId, data := range resInfo.AllocatablePerNuma {
tmpInfo.AllocatablePerNuma[numaId] = data
}
for numaID, data := range resInfo.UsedPerNuma {
tmpInfo.UsedPerNuma[numaID] = data
}
numaInfo.NumaResMap[resName] = tmpInfo
}
cpuDetail := info.CPUDetail
for cpuID, detail := range cpuDetail {
numaInfo.CPUDetail[cpuID] = detail
}
resReserved := info.ResReserved
for resName, res := range resReserved {
numaInfo.ResReserved[resName] = res
}
return numaInfo
}
// Compare reports the change of the resource on the kubelet.
// Return value:
// - true : the resource on the kubelet is increasing or unchanged
// - false : the resource on the kubelet is decreasing
func (info *NumatopoInfo) Compare(newInfo *NumatopoInfo) bool {
for resName := range info.NumaResMap {
oldSize := info.NumaResMap[resName].Allocatable.Size()
newSize := newInfo.NumaResMap[resName].Allocatable.Size()
if oldSize <= newSize {
return true
}
}
return false
}
// Allocate is the function to remove the allocated resource
func (info *NumatopoInfo) Allocate(resSets ResNumaSets) {
for resName := range resSets {
info.NumaResMap[resName].Allocatable = info.NumaResMap[resName].Allocatable.Difference(resSets[resName])
}
}
// Release is the function to reclaim the allocated resource
func (info *NumatopoInfo) Release(resSets ResNumaSets) {
for resName := range resSets {
info.NumaResMap[resName].Allocatable = info.NumaResMap[resName].Allocatable.Union(resSets[resName])
}
}
func GetPodResourceNumaInfo(ti *TaskInfo) map[int]v1.ResourceList {
if ti.NumaInfo != nil && len(ti.NumaInfo.ResMap) > 0 {
return ti.NumaInfo.ResMap
}
if _, ok := ti.Pod.Annotations[topologyDecisionAnnotation]; !ok {
return nil
}
decision := PodResourceDecision{}
err := json.Unmarshal([]byte(ti.Pod.Annotations[topologyDecisionAnnotation]), &decision)
if err != nil {
return nil
}
return decision.NUMAResources
}
// AddTask updates the per-NUMA-node used resources when a task is added
func (info *NumatopoInfo) AddTask(ti *TaskInfo) {
numaInfo := GetPodResourceNumaInfo(ti)
if numaInfo == nil {
return
}
for numaID, resList := range numaInfo {
for resName, quantity := range resList {
info.NumaResMap[string(resName)].UsedPerNuma[numaID] += ResQuantity2Float64(resName, quantity)
}
}
}
// RemoveTask updates the per-NUMA-node used resources when a task is removed
func (info *NumatopoInfo) RemoveTask(ti *TaskInfo) {
decision := GetPodResourceNumaInfo(ti)
if decision == nil {
return
}
for numaID, resList := range ti.NumaInfo.ResMap {
for resName, quantity := range resList {
info.NumaResMap[string(resName)].UsedPerNuma[numaID] -= ResQuantity2Float64(resName, quantity)
}
}
}
// GenerateNodeResNumaSets returns the idle resource sets of all nodes
func GenerateNodeResNumaSets(nodes map[string]*NodeInfo) map[string]ResNumaSets {
nodeSlice := make(map[string]ResNumaSets)
for _, node := range nodes {
if node.NumaSchedulerInfo == nil {
continue
}
resMaps := make(ResNumaSets)
for resName, resMap := range node.NumaSchedulerInfo.NumaResMap {
resMaps[resName] = resMap.Allocatable.Clone()
}
nodeSlice[node.Name] = resMaps
}
return nodeSlice
}
// GenerateNumaNodes returns the NUMA IDs of all nodes
func GenerateNumaNodes(nodes map[string]*NodeInfo) map[string][]int {
nodeNumaMap := make(map[string][]int)
for _, node := range nodes {
if node.NumaSchedulerInfo == nil {
continue
}
nodeNumaMap[node.Name] = node.NumaSchedulerInfo.CPUDetail.NUMANodes().ToSlice()
}
return nodeNumaMap
}
// ResNumaSets is the set map of the resource
type ResNumaSets map[string]cpuset.CPUSet
// Allocate is to remove the allocated resource which is assigned to task
func (resSets ResNumaSets) Allocate(taskSets ResNumaSets) {
for resName := range taskSets {
if _, ok := resSets[resName]; !ok {
continue
}
resSets[resName] = resSets[resName].Difference(taskSets[resName])
}
}
// Release is to reclaim the allocated resource which is assigned to task
func (resSets ResNumaSets) Release(taskSets ResNumaSets) {
for resName := range taskSets {
if _, ok := resSets[resName]; !ok {
continue
}
resSets[resName] = resSets[resName].Union(taskSets[resName])
}
}
// Clone returns a deep copy of the resource sets
func (resSets ResNumaSets) Clone() ResNumaSets {
newSets := make(ResNumaSets)
for resName := range resSets {
newSets[resName] = resSets[resName].Clone()
}
return newSets
}
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"volcano.sh/apis/pkg/apis/scheduling"
)
// PodGroupPhase is the phase of a pod group at the current time.
type PodGroupPhase string
// These are the valid phase of podGroups.
const (
// PodGroupVersionV1Beta1 represents PodGroupVersion of v1beta1
PodGroupVersionV1Beta1 string = "v1beta1"
)
// PodGroup is a collection of Pod; used for batch workload.
type PodGroup struct {
scheduling.PodGroup
// Version represents the version of PodGroup
Version string
}
func (pg *PodGroup) Clone() *PodGroup {
return &PodGroup{
PodGroup: *pg.PodGroup.DeepCopy(),
Version: pg.Version,
}
}
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
// Refer k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/predicates.go#GetResourceRequest.
//
// GetResourceRequest returns a *Resource that covers the largest width in each resource dimension.
// Because init-containers run sequentially, we collect the max in each dimension iteratively.
// In contrast, we sum the resource vectors for regular containers since they run simultaneously.
//
// To be consistent with the kubernetes default scheduler, it is only used for the predicates of actions (e.g.
// allocate, backfill, preempt, reclaim); please use GetPodResourceWithoutInitContainers for other cases.
//
// Example:
//
// Pod:
// InitContainers
// IC1:
// CPU: 2
// Memory: 1G
// IC2:
// CPU: 2
// Memory: 3G
// Containers
// C1:
// CPU: 2
// Memory: 1G
// C2:
// CPU: 1
// Memory: 1G
//
// Result: CPU: 3, Memory: 3G
// GetPodResourceRequest returns all the resources required for the pod
func GetPodResourceRequest(pod *v1.Pod) *Resource {
result := GetPodResourceWithoutInitContainers(pod)
// take max_resource(sum_pod, any_init_container)
for _, container := range pod.Spec.InitContainers {
result.SetMaxResource(NewResource(container.Resources.Requests))
}
return result
}
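// Concrete version of the example in the comment above (hypothetical pod spec;
// resource here refers to k8s.io/apimachinery/pkg/api/resource):
//
//	pod := &v1.Pod{Spec: v1.PodSpec{
//		InitContainers: []v1.Container{
//			{Resources: v1.ResourceRequirements{Requests: v1.ResourceList{
//				v1.ResourceCPU: resource.MustParse("2"), v1.ResourceMemory: resource.MustParse("1Gi")}}},
//			{Resources: v1.ResourceRequirements{Requests: v1.ResourceList{
//				v1.ResourceCPU: resource.MustParse("2"), v1.ResourceMemory: resource.MustParse("3Gi")}}},
//		},
//		Containers: []v1.Container{
//			{Resources: v1.ResourceRequirements{Requests: v1.ResourceList{
//				v1.ResourceCPU: resource.MustParse("2"), v1.ResourceMemory: resource.MustParse("1Gi")}}},
//			{Resources: v1.ResourceRequirements{Requests: v1.ResourceList{
//				v1.ResourceCPU: resource.MustParse("1"), v1.ResourceMemory: resource.MustParse("1Gi")}}},
//		},
//	}}
//	req := GetPodResourceRequest(pod)
//	// containers sum: cpu 3, memory 2Gi; init-container max: cpu 2, memory 3Gi
//	// => req.MilliCPU == 3000, req.Memory == 3Gi (per-dimension max of the two)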
// GetPodPreemptable returns the volcano.sh/preemptable value for a pod
func GetPodPreemptable(pod *v1.Pod) bool {
// check annotation first
if len(pod.Annotations) > 0 {
if value, found := pod.Annotations[v1beta1.PodPreemptable]; found {
b, err := strconv.ParseBool(value)
if err != nil {
klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
return false
}
return b
}
}
// if the annotation does not exist, check the label
if len(pod.Labels) > 0 {
if value, found := pod.Labels[v1beta1.PodPreemptable]; found {
b, err := strconv.ParseBool(value)
if err != nil {
klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
return false
}
return b
}
}
return false
}
// GetPodRevocableZone returns the volcano.sh/revocable-zone value for a pod/podgroup
func GetPodRevocableZone(pod *v1.Pod) string {
if len(pod.Annotations) > 0 {
if value, found := pod.Annotations[v1beta1.RevocableZone]; found {
if value != "*" {
return ""
}
return value
}
if value, found := pod.Annotations[v1beta1.PodPreemptable]; found {
if b, err := strconv.ParseBool(value); err == nil && b {
return "*"
}
}
}
return ""
}
// GetPodTopologyInfo returns the volcano.sh/numa-topology-policy value for a pod
func GetPodTopologyInfo(pod *v1.Pod) *TopologyInfo {
info := TopologyInfo{
ResMap: make(map[int]v1.ResourceList),
}
if len(pod.Annotations) > 0 {
if value, found := pod.Annotations[v1beta1.NumaPolicyKey]; found {
info.Policy = value
}
if value, found := pod.Annotations[topologyDecisionAnnotation]; found {
decision := PodResourceDecision{}
err := json.Unmarshal([]byte(value), &decision)
if err == nil {
info.ResMap = decision.NUMAResources
}
}
}
return &info
}
// GetPodResourceWithoutInitContainers returns the Pod's resource request; it does not include
// the init containers' resource requests.
func GetPodResourceWithoutInitContainers(pod *v1.Pod) *Resource {
result := EmptyResource()
for _, container := range pod.Spec.Containers {
result.Add(NewResource(container.Resources.Requests))
}
return result
}
// GetGPUIndex returns the ID of the GPU
func GetGPUIndex(pod *v1.Pod) int {
if len(pod.Annotations) > 0 {
value, found := pod.Annotations[GPUIndex]
if found {
id, err := strconv.Atoi(value)
if err != nil {
klog.Errorf("invalid %s=%s", GPUIndex, value)
return -1
}
return id
}
}
return -1
}
func escapeJSONPointer(p string) string {
// Escaping reference name using https://tools.ietf.org/html/rfc6901
p = strings.Replace(p, "~", "~0", -1)
p = strings.Replace(p, "/", "~1", -1)
return p
}
// AddGPUIndexPatch returns the patch adding GPU index
func AddGPUIndexPatch(id int) string {
return fmt.Sprintf(`[{"op": "add", "path": "/metadata/annotations/%s", "value":"%d"},`+
`{"op": "add", "path": "/metadata/annotations/%s", "value": "%d"}]`,
escapeJSONPointer(PredicateTime), time.Now().UnixNano(),
escapeJSONPointer(GPUIndex), id)
}
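// Illustrative output (the annotation keys are the PredicateTime and GPUIndex
// package constants, whose concrete strings are defined elsewhere):
//
//	patch := AddGPUIndexPatch(1)
//	// [{"op": "add", "path": "/metadata/annotations/<PredicateTime>", "value":"<now-ns>"},
//	//  {"op": "add", "path": "/metadata/annotations/<GPUIndex>", "value": "1"}]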
// RemoveGPUIndexPatch returns the patch removing GPU index
func RemoveGPUIndexPatch() string {
return fmt.Sprintf(`[{"op": "remove", "path": "/metadata/annotations/%s"},`+
`{"op": "remove", "path": "/metadata/annotations/%s"}]`, escapeJSONPointer(PredicateTime), escapeJSONPointer(GPUIndex))
}
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"k8s.io/apimachinery/pkg/types"
"volcano.sh/apis/pkg/apis/scheduling"
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
// QueueID is a UID type, serving as the unique ID for each queue
type QueueID types.UID
// QueueInfo holds all details about a queue
type QueueInfo struct {
UID QueueID
Name string
Weight int32
// Weights is a list of slash-separated float numbers.
// Each of them is a weight corresponding to the
// hierarchy level.
Weights string
// Hierarchy is a list of node name along the
// path from the root to the node itself.
Hierarchy string
Queue *scheduling.Queue
}
// NewQueueInfo creates new queueInfo object
func NewQueueInfo(queue *scheduling.Queue) *QueueInfo {
return &QueueInfo{
UID: QueueID(queue.Name),
Name: queue.Name,
Weight: queue.Spec.Weight,
Hierarchy: queue.Annotations[v1beta1.KubeHierarchyAnnotationKey],
Weights: queue.Annotations[v1beta1.KubeHierarchyWeightAnnotationKey],
Queue: queue,
}
}
// Clone is used to clone queueInfo object
func (q *QueueInfo) Clone() *QueueInfo {
return &QueueInfo{
UID: q.UID,
Name: q.Name,
Weight: q.Weight,
Hierarchy: q.Hierarchy,
Weights: q.Weights,
Queue: q.Queue,
}
}
// Reclaimable returns whether the queue is reclaimable
func (q *QueueInfo) Reclaimable() bool {
if q == nil {
return false
}
if q.Queue == nil {
return false
}
if q.Queue.Spec.Reclaimable == nil {
return true
}
return *q.Queue.Spec.Reclaimable
}
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"fmt"
"math"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"volcano.sh/volcano/pkg/scheduler/util/assert"
)
const (
// GPUResourceName need to follow https://github.com/NVIDIA/k8s-device-plugin/blob/66a35b71ac4b5cbfb04714678b548bd77e5ba719/server.go#L20
GPUResourceName = "nvidia.com/gpu"
)
const (
minResource float64 = 0.1
)
// DimensionDefaultValue is the default value used for a resource dimension that is not defined
type DimensionDefaultValue int
const (
// Zero means resource dimension not defined will be treated as zero
Zero DimensionDefaultValue = 0
// Infinity means resource dimension not defined will be treated as infinity
Infinity DimensionDefaultValue = -1
)
// Resource struct defines all the resource type
type Resource struct {
MilliCPU float64
Memory float64
// ScalarResources
ScalarResources map[v1.ResourceName]float64
// MaxTaskNum is only used by predicates; it should NOT
// be accounted in other operators, e.g. Add.
MaxTaskNum int
}
// EmptyResource creates an empty resource object and returns it
func EmptyResource() *Resource {
return &Resource{}
}
// NewResource creates a new resource object from resource list
func NewResource(rl v1.ResourceList) *Resource {
r := EmptyResource()
for rName, rQuant := range rl {
switch rName {
case v1.ResourceCPU:
r.MilliCPU += float64(rQuant.MilliValue())
case v1.ResourceMemory:
r.Memory += float64(rQuant.Value())
case v1.ResourcePods:
r.MaxTaskNum += int(rQuant.Value())
default:
// NOTE: when converting this back to a k8s resource, we need to record the format and divide by 1000
if v1helper.IsScalarResourceName(rName) {
r.AddScalar(rName, float64(rQuant.MilliValue()))
}
}
}
return r
}
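// Quick sketch of the conversion rules above (hypothetical values):
//
//	rl := v1.ResourceList{
//		v1.ResourceCPU:    resource.MustParse("500m"),
//		v1.ResourceMemory: resource.MustParse("1Gi"),
//		v1.ResourcePods:   resource.MustParse("110"),
//	}
//	r := NewResource(rl)
//	// r.MilliCPU == 500, r.Memory == 1073741824, r.MaxTaskNum == 110;
//	// scalar resources such as nvidia.com/gpu would be stored in milli-units.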
// ResFloat642Quantity transforms a float64 value into a resource.Quantity
func ResFloat642Quantity(resName v1.ResourceName, quantity float64) resource.Quantity {
var resQuantity *resource.Quantity
switch resName {
case v1.ResourceCPU:
resQuantity = resource.NewMilliQuantity(int64(quantity), resource.DecimalSI)
default:
resQuantity = resource.NewQuantity(int64(quantity), resource.BinarySI)
}
return *resQuantity
}
// ResQuantity2Float64 transforms a resource.Quantity into a float64 value
func ResQuantity2Float64(resName v1.ResourceName, quantity resource.Quantity) float64 {
var resQuantity float64
switch resName {
case v1.ResourceCPU:
resQuantity = float64(quantity.MilliValue())
default:
resQuantity = float64(quantity.Value())
}
return resQuantity
}
// Clone is used to clone a resource type, which is a deep copy function.
func (r *Resource) Clone() *Resource {
clone := &Resource{
MilliCPU: r.MilliCPU,
Memory: r.Memory,
MaxTaskNum: r.MaxTaskNum,
}
if r.ScalarResources != nil {
clone.ScalarResources = make(map[v1.ResourceName]float64)
for k, v := range r.ScalarResources {
clone.ScalarResources[k] = v
}
}
return clone
}
// String returns resource details in string format
func (r *Resource) String() string {
str := fmt.Sprintf("cpu %0.2f, memory %0.2f", r.MilliCPU, r.Memory)
for rName, rQuant := range r.ScalarResources {
str = fmt.Sprintf("%s, %s %0.2f", str, rName, rQuant)
}
return str
}
// ResourceNames returns all resource types
func (r *Resource) ResourceNames() ResourceNameList {
resNames := ResourceNameList{}
if r.MilliCPU >= minResource {
resNames = append(resNames, v1.ResourceCPU)
}
if r.Memory >= minResource {
resNames = append(resNames, v1.ResourceMemory)
}
for rName, rMount := range r.ScalarResources {
if rMount >= minResource {
resNames = append(resNames, rName)
}
}
return resNames
}
// Get returns the resource value for that particular resource type
func (r *Resource) Get(rn v1.ResourceName) float64 {
switch rn {
case v1.ResourceCPU:
return r.MilliCPU
case v1.ResourceMemory:
return r.Memory
default:
if r.ScalarResources == nil {
return 0
}
return r.ScalarResources[rn]
}
}
// IsEmpty returns true only if every resource dimension is below the minimum value; otherwise it returns false
func (r *Resource) IsEmpty() bool {
if !(r.MilliCPU < minResource && r.Memory < minResource) {
return false
}
for _, rQuant := range r.ScalarResources {
if rQuant >= minResource {
return false
}
}
return true
}
// IsZero returns true if the given kind of resource is below the minimum value
func (r *Resource) IsZero(rn v1.ResourceName) bool {
switch rn {
case v1.ResourceCPU:
return r.MilliCPU < minResource
case v1.ResourceMemory:
return r.Memory < minResource
default:
if r.ScalarResources == nil {
return true
}
_, found := r.ScalarResources[rn]
assert.Assertf(found, "unknown resource %s", rn)
return r.ScalarResources[rn] < minResource
}
}
// Add is used to add two given resources
func (r *Resource) Add(rr *Resource) *Resource {
r.MilliCPU += rr.MilliCPU
r.Memory += rr.Memory
for rName, rQuant := range rr.ScalarResources {
if r.ScalarResources == nil {
r.ScalarResources = map[v1.ResourceName]float64{}
}
r.ScalarResources[rName] += rQuant
}
return r
}
// Sub subtracts two Resource objects with assertion.
func (r *Resource) Sub(rr *Resource) *Resource {
assert.Assertf(rr.LessEqual(r, Zero), "resource is not sufficient to do operation: <%v> sub <%v>", r, rr)
return r.sub(rr)
}
// sub subtracts two Resource objects.
func (r *Resource) sub(rr *Resource) *Resource {
r.MilliCPU -= rr.MilliCPU
r.Memory -= rr.Memory
if r.ScalarResources == nil {
return r
}
for rrName, rrQuant := range rr.ScalarResources {
r.ScalarResources[rrName] -= rrQuant
}
return r
}
// Multi multiplies the resource by the provided ratio
func (r *Resource) Multi(ratio float64) *Resource {
r.MilliCPU *= ratio
r.Memory *= ratio
for rName, rQuant := range r.ScalarResources {
r.ScalarResources[rName] = rQuant * ratio
}
return r
}
// SetMaxResource compares r with rr and keeps the maximum value for each resource dimension.
func (r *Resource) SetMaxResource(rr *Resource) {
if r == nil || rr == nil {
return
}
if rr.MilliCPU > r.MilliCPU {
r.MilliCPU = rr.MilliCPU
}
if rr.Memory > r.Memory {
r.Memory = rr.Memory
}
for rrName, rrQuant := range rr.ScalarResources {
if r.ScalarResources == nil {
r.ScalarResources = make(map[v1.ResourceName]float64)
for k, v := range rr.ScalarResources {
r.ScalarResources[k] = v
}
return
}
_, ok := r.ScalarResources[rrName]
if !ok || rrQuant > r.ScalarResources[rrName] {
r.ScalarResources[rrName] = rrQuant
}
}
}
// FitDelta computes the delta between a resource object representing available
// resources and an operand representing resources being requested. Any
// field that is less than 0 after the operation represents an
// insufficient resource.
func (r *Resource) FitDelta(rr *Resource) *Resource {
if rr.MilliCPU > 0 {
r.MilliCPU -= rr.MilliCPU + minResource
}
if rr.Memory > 0 {
r.Memory -= rr.Memory + minResource
}
if r.ScalarResources == nil {
r.ScalarResources = make(map[v1.ResourceName]float64)
}
for rrName, rrQuant := range rr.ScalarResources {
if rrQuant > 0 {
_, ok := r.ScalarResources[rrName]
if !ok {
r.ScalarResources[rrName] = 0
}
r.ScalarResources[rrName] -= rrQuant + minResource
}
}
return r
}
// Less returns true only on condition that all dimensions of resources in r are less than that of rr,
// Otherwise returns false.
// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
func (r *Resource) Less(rr *Resource, defaultValue DimensionDefaultValue) bool {
lessFunc := func(l, r float64) bool {
return l < r
}
if !lessFunc(r.MilliCPU, rr.MilliCPU) {
return false
}
if !lessFunc(r.Memory, rr.Memory) {
return false
}
for resourceName, leftValue := range r.ScalarResources {
rightValue, ok := rr.ScalarResources[resourceName]
if !ok && defaultValue == Infinity {
continue
}
if !lessFunc(leftValue, rightValue) {
return false
}
}
return true
}
// LessEqual returns true only on condition that all dimensions of resources in r are less than or equal with that of rr,
// Otherwise returns false.
// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) bool {
lessEqualFunc := func(l, r, diff float64) bool {
if l < r || math.Abs(l-r) < diff {
return true
}
return false
}
if !lessEqualFunc(r.MilliCPU, rr.MilliCPU, minResource) {
return false
}
if !lessEqualFunc(r.Memory, rr.Memory, minResource) {
return false
}
for resourceName, leftValue := range r.ScalarResources {
rightValue, ok := rr.ScalarResources[resourceName]
if !ok && defaultValue == Infinity {
continue
}
if !lessEqualFunc(leftValue, rightValue, minResource) {
return false
}
}
return true
}
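// Note on the tolerance (a sketch with hypothetical values): two dimensions are
// treated as equal when they differ by less than minResource (0.1).
//
//	a := &Resource{MilliCPU: 1000, Memory: 1 << 30}
//	b := &Resource{MilliCPU: 1000.05, Memory: 1 << 30}
//	a.LessEqual(b, Zero) // true: the 0.05 milli-CPU gap is below minResource
//	b.LessEqual(a, Zero) // also true, for the same reason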
// LessPartly returns true if there exists any dimension whose resource amount in r is less than that in rr.
// Otherwise returns false.
// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
func (r *Resource) LessPartly(rr *Resource, defaultValue DimensionDefaultValue) bool {
lessFunc := func(l, r float64) bool {
return l < r
}
if lessFunc(r.MilliCPU, rr.MilliCPU) || lessFunc(r.Memory, rr.Memory) {
return true
}
for resourceName, leftValue := range r.ScalarResources {
rightValue, ok := rr.ScalarResources[resourceName]
if !ok && defaultValue == Infinity {
return true
}
if lessFunc(leftValue, rightValue) {
return true
}
}
return false
}
// LessEqualPartly returns true if there exists any dimension whose resource amount in r is less than or equal with that in rr.
// Otherwise returns false.
// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
func (r *Resource) LessEqualPartly(rr *Resource, defaultValue DimensionDefaultValue) bool {
lessEqualFunc := func(l, r, diff float64) bool {
if l < r || math.Abs(l-r) < diff {
return true
}
return false
}
if lessEqualFunc(r.MilliCPU, rr.MilliCPU, minResource) || lessEqualFunc(r.Memory, rr.Memory, minResource) {
return true
}
for resourceName, leftValue := range r.ScalarResources {
rightValue, ok := rr.ScalarResources[resourceName]
if !ok && defaultValue == Infinity {
return true
}
if lessEqualFunc(leftValue, rightValue, minResource) {
return true
}
}
return false
}
// Equal returns true only on condition that values in all dimension are equal with each other for r and rr
// Otherwise returns false.
// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
func (r *Resource) Equal(rr *Resource, defaultValue DimensionDefaultValue) bool {
equalFunc := func(l, r, diff float64) bool {
return l == r || math.Abs(l-r) < diff
}
if !equalFunc(r.MilliCPU, rr.MilliCPU, minResource) || !equalFunc(r.Memory, rr.Memory, minResource) {
return false
}
for resourceName, leftValue := range r.ScalarResources {
rightValue := rr.ScalarResources[resourceName]
if !equalFunc(leftValue, rightValue, minResource) {
return false
}
}
return true
}
// Diff calculates the difference between two resource objects.
// Note: if `defaultValue` equals `Infinity`, the difference between two values will be `Infinity`, marked as -1
func (r *Resource) Diff(rr *Resource, defaultValue DimensionDefaultValue) (*Resource, *Resource) {
leftRes := r.Clone()
rightRes := rr.Clone()
increasedVal := EmptyResource()
decreasedVal := EmptyResource()
r.setDefaultValue(leftRes, rightRes, defaultValue)
if leftRes.MilliCPU > rightRes.MilliCPU {
increasedVal.MilliCPU = leftRes.MilliCPU - rightRes.MilliCPU
} else {
decreasedVal.MilliCPU = rightRes.MilliCPU - leftRes.MilliCPU
}
if leftRes.Memory > rightRes.Memory {
increasedVal.Memory = leftRes.Memory - rightRes.Memory
} else {
decreasedVal.Memory = rightRes.Memory - leftRes.Memory
}
increasedVal.ScalarResources = make(map[v1.ResourceName]float64)
decreasedVal.ScalarResources = make(map[v1.ResourceName]float64)
for lName, lQuant := range leftRes.ScalarResources {
rQuant := rightRes.ScalarResources[lName]
if lQuant == -1 {
increasedVal.ScalarResources[lName] = -1
continue
}
if rQuant == -1 {
decreasedVal.ScalarResources[lName] = -1
continue
}
if lQuant > rQuant {
increasedVal.ScalarResources[lName] = lQuant - rQuant
} else {
decreasedVal.ScalarResources[lName] = rQuant - lQuant
}
}
return increasedVal, decreasedVal
}
// AddScalar adds a resource by a scalar value of this resource.
func (r *Resource) AddScalar(name v1.ResourceName, quantity float64) {
r.SetScalar(name, r.ScalarResources[name]+quantity)
}
// SetScalar sets a resource by a scalar value of this resource.
func (r *Resource) SetScalar(name v1.ResourceName, quantity float64) {
// Lazily allocate scalar resource map.
if r.ScalarResources == nil {
r.ScalarResources = map[v1.ResourceName]float64{}
}
r.ScalarResources[name] = quantity
}
// MinDimensionResource sets each resource dimension of r to the minimum of r and rr.
// e.g r resource is <cpu 2000.00, memory 4047845376.00, hugepages-2Mi 0.00, hugepages-1Gi 0.00>
// rr resource is <cpu 3000.00, memory 1000.00>
// return r resource is <cpu 2000.00, memory 1000.00, hugepages-2Mi 0.00, hugepages-1Gi 0.00>
// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
func (r *Resource) MinDimensionResource(rr *Resource, defaultValue DimensionDefaultValue) *Resource {
if rr.MilliCPU < r.MilliCPU {
r.MilliCPU = rr.MilliCPU
}
if rr.Memory < r.Memory {
r.Memory = rr.Memory
}
if r.ScalarResources == nil {
return r
}
if rr.ScalarResources == nil {
if defaultValue == Infinity {
return r
}
for name := range r.ScalarResources {
r.ScalarResources[name] = 0
}
return r
}
for name, quant := range r.ScalarResources {
rQuant, ok := rr.ScalarResources[name]
if ok {
r.ScalarResources[name] = math.Min(quant, rQuant)
} else {
if defaultValue == Infinity {
continue
}
r.ScalarResources[name] = 0
}
}
return r
}
// setDefaultValue sets default value for resource dimension not defined of ScalarResource in leftResource and rightResource
// @param defaultValue "default value for resource dimension not defined in ScalarResources. It can only be one of 'Zero' or 'Infinity'"
func (r *Resource) setDefaultValue(leftResource, rightResource *Resource, defaultValue DimensionDefaultValue) {
if leftResource.ScalarResources == nil {
leftResource.ScalarResources = map[v1.ResourceName]float64{}
}
if rightResource.ScalarResources == nil {
rightResource.ScalarResources = map[v1.ResourceName]float64{}
}
for resourceName := range leftResource.ScalarResources {
_, ok := rightResource.ScalarResources[resourceName]
if !ok {
if defaultValue == Zero {
rightResource.ScalarResources[resourceName] = 0
} else if defaultValue == Infinity {
rightResource.ScalarResources[resourceName] = -1
}
}
}
for resourceName := range rightResource.ScalarResources {
_, ok := leftResource.ScalarResources[resourceName]
if !ok {
if defaultValue == Zero {
leftResource.ScalarResources[resourceName] = 0
} else if defaultValue == Infinity {
leftResource.ScalarResources[resourceName] = -1
}
}
}
}
// ParseResourceList parses the given configuration map into an API
// ResourceList or returns an error.
func ParseResourceList(m map[string]string) (v1.ResourceList, error) {
if len(m) == 0 {
return nil, nil
}
rl := make(v1.ResourceList)
for k, v := range m {
switch v1.ResourceName(k) {
// Only CPU, memory, and ephemeral storage resources are supported.
case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceEphemeralStorage:
q, err := resource.ParseQuantity(v)
if err != nil {
return nil, err
}
if q.Sign() == -1 {
return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v)
}
rl[v1.ResourceName(k)] = q
default:
return nil, fmt.Errorf("cannot reserve %q resource", k)
}
}
return rl, nil
}
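// Usage sketch:
//
//	rl, err := ParseResourceList(map[string]string{"cpu": "2", "memory": "4Gi"})
//	// err == nil; rl[v1.ResourceCPU] == 2, rl[v1.ResourceMemory] == 4Gi.
//	// Unsupported keys such as "nvidia.com/gpu" are rejected with an error.
// GetMinResource returns the minimum resource threshold used in comparisons.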
func GetMinResource() float64 {
return minResource
}
// ResourceNameList struct defines resource name collection
type ResourceNameList []v1.ResourceName
// Contains judges whether rr is a subset of r
func (r ResourceNameList) Contains(rr ResourceNameList) bool {
for _, rrName := range ([]v1.ResourceName)(rr) {
isResourceExist := false
for _, rName := range ([]v1.ResourceName)(r) {
if rName == rrName {
isResourceExist = true
break
}
}
if !isResourceExist {
return false
}
}
return true
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"k8s.io/apimachinery/pkg/types"
"volcano.sh/apis/pkg/apis/scheduling"
)
// ClusterID is a UID type, serving as the unique ID for each cluster
type ClusterID types.UID
// SiloClusterInfo holds all details about a silo cluster
type SiloClusterInfo struct {
UID ClusterID
Cluster *scheduling.Cluster
}
// NewSiloClusterInfo creates a new SiloClusterInfo object
func NewSiloClusterInfo(cluster *scheduling.Cluster) *SiloClusterInfo {
return &SiloClusterInfo{
UID: ClusterID(cluster.Name),
Cluster: cluster,
}
}
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
)
func buildNode(name string, alloc v1.ResourceList) *v1.Node {
return &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
Status: v1.NodeStatus{
Capacity: alloc,
Allocatable: alloc,
},
}
}
func buildPod(ns, n, nn string, p v1.PodPhase, req v1.ResourceList, owner []metav1.OwnerReference, labels map[string]string) *v1.Pod {
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: types.UID(fmt.Sprintf("%v-%v", ns, n)),
Name: n,
Namespace: ns,
OwnerReferences: owner,
Labels: labels,
},
Status: v1.PodStatus{
Phase: p,
},
Spec: v1.PodSpec{
NodeName: nn,
Containers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Requests: req,
},
},
},
},
}
}
func buildResourceList(cpu string, memory string) v1.ResourceList {
return v1.ResourceList{
v1.ResourceCPU: resource.MustParse(cpu),
v1.ResourceMemory: resource.MustParse(memory),
}
}
func buildResource(cpu string, memory string) *Resource {
return NewResource(v1.ResourceList{
v1.ResourceCPU: resource.MustParse(cpu),
v1.ResourceMemory: resource.MustParse(memory),
})
}
func buildOwnerReference(owner string) metav1.OwnerReference {
controller := true
return metav1.OwnerReference{
Controller: &controller,
UID: types.UID(owner),
}
}
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package api
import (
k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
)
// TaskStatus defines the status of a task/pod.
type TaskStatus int
const (
// Pending means the task is pending in the apiserver.
Pending TaskStatus = 1 << iota
// Allocated means the scheduler assigns a host to it.
Allocated
// Pipelined means the scheduler assigns a host to wait for releasing resource.
Pipelined
// Binding means the scheduler sends a Bind request to the apiserver.
Binding
// Bound means the task/Pod is bound to a host.
Bound
// Running means a task is running on the host.
Running
// Releasing means a task/pod is deleted.
Releasing
// Succeeded means that all containers in the pod have voluntarily terminated
// with a container exit code of 0, and the system is not going to restart any of these containers.
Succeeded
// Failed means that all containers in the pod have terminated, and at least one container has
// terminated in a failure (exited with a non-zero exit code or was stopped by the system).
Failed
// Unknown means the status of task/pod is unknown to the scheduler.
Unknown
)
func (ts TaskStatus) String() string {
switch ts {
case Pending:
return "Pending"
case Allocated:
return "Allocated"
case Pipelined:
return "Pipelined"
case Binding:
return "Binding"
case Bound:
return "Bound"
case Running:
return "Running"
case Releasing:
return "Releasing"
case Succeeded:
return "Succeeded"
case Failed:
return "Failed"
default:
return "Unknown"
}
}
// NodePhase defines the phase of node
type NodePhase int
const (
// Ready means the node is ready for scheduling
Ready NodePhase = 1 << iota
// NotReady means the node is not ready for scheduling
NotReady
)
func (np NodePhase) String() string {
switch np {
case Ready:
return "Ready"
case NotReady:
return "NotReady"
}
return "Unknown"
}
// validateStatusUpdate validates whether the status transfer is valid.
func validateStatusUpdate(oldStatus, newStatus TaskStatus) error {
return nil
}
// LessFn is the func declaration used by sort or priority queue.
type LessFn func(interface{}, interface{}) bool
// CompareFn is the func declaration used by sort or priority queue.
type CompareFn func(interface{}, interface{}) int
// ValidateFn is the func declaration used to check object's status.
type ValidateFn func(interface{}) bool
// ValidateResult is a struct that can be used to determine the validation result
type ValidateResult struct {
Pass bool
Reason string
Message string
}
// ValidateExFn is the func declaration used to validate the result.
type ValidateExFn func(interface{}) *ValidateResult
// VoteFn is the func declaration used to check object's complicated status.
type VoteFn func(interface{}) int
// JobEnqueuedFn is the func declaration used to call after job enqueued.
type JobEnqueuedFn func(interface{})
// PredicateFn is the func declaration used to predicate node for task.
type PredicateFn func(*TaskInfo, *NodeInfo) error
// BestNodeFn is the func declaration used to return the nodeScores to plugins.
type BestNodeFn func(*TaskInfo, map[float64][]*NodeInfo) *NodeInfo
// EvictableFn is the func declaration used to evict tasks.
type EvictableFn func(*TaskInfo, []*TaskInfo) ([]*TaskInfo, int)
// NodeOrderFn is the func declaration used to get priority score for a node for a particular task.
type NodeOrderFn func(*TaskInfo, *NodeInfo) (float64, error)
// BatchNodeOrderFn is the func declaration used to get priority score for ALL nodes for a particular task.
type BatchNodeOrderFn func(*TaskInfo, []*NodeInfo) (map[string]float64, error)
// NodeMapFn is the func declaration used to get priority score for a node for a particular task.
type NodeMapFn func(*TaskInfo, *NodeInfo) (float64, error)
// NodeReduceFn is the func declaration used to reduce priority score for a node for a particular task.
type NodeReduceFn func(*TaskInfo, k8sframework.NodeScoreList) error
// NodeOrderMapFn is the func declaration used to get priority score of all plugins for a node for a particular task.
type NodeOrderMapFn func(*TaskInfo, *NodeInfo) (map[string]float64, float64, error)
// NodeOrderReduceFn is the func declaration used to reduce priority score of all nodes for a plugin for a particular task.
type NodeOrderReduceFn func(*TaskInfo, map[string]k8sframework.NodeScoreList) (map[string]float64, error)
// TargetJobFn is the func declaration used to select the target job that satisfies some conditions
type TargetJobFn func([]*JobInfo) *JobInfo
// ReservedNodesFn is the func declaration used to select the reserved nodes
type ReservedNodesFn func()
// VictimTasksFn is the func declaration used to select victim tasks
type VictimTasksFn func() []*TaskInfo
// UnderUsedResourceFn is the func declaration used to get under used resource list for queue
type UnderUsedResourceFn func(*QueueInfo) ResourceNameList
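// The function types above are the callback surface that plugins register with
// the scheduling framework. A minimal illustrative sketch (not part of the
// package) of providing a LessFn and a PredicateFn; the pod-count limit used
// in the predicate is an assumption for demonstration only:
//
//	// jobOrderFn orders two *JobInfo values by their Priority field.
//	var jobOrderFn LessFn = func(l, r interface{}) bool {
//		return l.(*JobInfo).Priority > r.(*JobInfo).Priority
//	}
//
//	// podCountFit rejects nodes that already run maxPods or more tasks.
//	const maxPods = 110 // assumed limit, for illustration only
//	var podCountFit PredicateFn = func(task *TaskInfo, node *NodeInfo) error {
//		if len(node.Tasks) >= maxPods {
//			return errors.New(NodePodNumberExceeded)
//		}
//		return nil
//	}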
package api
import (
"fmt"
"sort"
"strings"
)
const (
// NodePodNumberExceeded means pods in node exceed the allocatable pod number
NodePodNumberExceeded = "node(s) pod number exceeded"
// NodeResourceFitFailed means node could not fit the request of pod
NodeResourceFitFailed = "node(s) resource fit failed"
// AllNodeUnavailableMsg is the default error message
AllNodeUnavailableMsg = "all nodes are unavailable"
)
// These are reasons for a pod's transition to a condition.
const (
// PodReasonUnschedulable reason in PodScheduled PodCondition means that the scheduler
// can't schedule the pod right now, for example due to insufficient resources in the cluster.
PodReasonUnschedulable = "Unschedulable"
// PodReasonSchedulable reason in PodScheduled PodCondition means that the scheduler
// can schedule the pod right now, but has not bound it yet
PodReasonSchedulable = "Schedulable"
// PodReasonUndetermined reason in PodScheduled PodCondition means that the scheduler
// skips scheduling the pod, leaving it `Undetermined`, for example because an unschedulable pod has already been encountered.
PodReasonUndetermined = "Undetermined"
)
// FitErrors is a set of FitError, one per node
type FitErrors struct {
nodes map[string]*FitError
err string
}
// NewFitErrors returns an empty FitErrors
func NewFitErrors() *FitErrors {
f := new(FitErrors)
f.nodes = make(map[string]*FitError)
return f
}
// SetError sets the common error message in FitErrors
func (f *FitErrors) SetError(err string) {
f.err = err
}
// SetNodeError sets the error for the given node in FitErrors
func (f *FitErrors) SetNodeError(nodeName string, err error) {
var fe *FitError
switch obj := err.(type) {
case *FitError:
obj.NodeName = nodeName
fe = obj
default:
fe = &FitError{
NodeName: nodeName,
Reasons: []string{obj.Error()},
}
}
f.nodes[nodeName] = fe
}
// Error returns the final error message
func (f *FitErrors) Error() string {
reasons := make(map[string]int)
for _, node := range f.nodes {
for _, reason := range node.Reasons {
reasons[reason]++
}
}
sortReasonsHistogram := func() []string {
reasonStrings := []string{}
for k, v := range reasons {
reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
}
sort.Strings(reasonStrings)
return reasonStrings
}
if f.err == "" {
f.err = AllNodeUnavailableMsg
}
reasonMsg := fmt.Sprintf(f.err+": %v.", strings.Join(sortReasonsHistogram(), ", "))
return reasonMsg
}
// FitError describes the reason why a task could not fit on a node
type FitError struct {
taskNamespace string
taskName string
NodeName string
Reasons []string
}
// NewFitError returns a FitError built from the given messages
func NewFitError(task *TaskInfo, node *NodeInfo, message ...string) *FitError {
fe := &FitError{
taskName: task.Name,
taskNamespace: task.Namespace,
NodeName: node.Name,
Reasons: message,
}
return fe
}
// Error returns the final error message
func (f *FitError) Error() string {
return fmt.Sprintf("task %s/%s on node %s fit failed: %s", f.taskNamespace, f.taskName, f.NodeName, strings.Join(f.Reasons, ", "))
}
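// An illustrative sketch (not part of the package) of how FitErrors aggregates
// per-node failures into a single message; the node names are made up:
//
//	fitErrs := NewFitErrors()
//	fitErrs.SetNodeError("node-1", fmt.Errorf(NodeResourceFitFailed))
//	fitErrs.SetNodeError("node-2", fmt.Errorf(NodeResourceFitFailed))
//	fitErrs.SetNodeError("node-3", fmt.Errorf(NodePodNumberExceeded))
//
//	// Error() counts identical reasons across nodes and prefixes the common
//	// message, producing something like:
//	//   all nodes are unavailable: 1 node(s) pod number exceeded, 2 node(s) resource fit failed.
//	msg := fitErrs.Error()
//	_ = msg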
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"context"
"fmt"
"os"
"strconv"
"strings"
"sync"
"time"
v1 "k8s.io/api/core/v1"
schedulingv1 "k8s.io/api/scheduling/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/informers"
infov1 "k8s.io/client-go/informers/core/v1"
schedv1 "k8s.io/client-go/informers/scheduling/v1"
storagev1 "k8s.io/client-go/informers/storage/v1"
storagev1alpha1 "k8s.io/client-go/informers/storage/v1alpha1"
"k8s.io/client-go/kubernetes"
corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/apis/pkg/apis/scheduling"
schedulingscheme "volcano.sh/apis/pkg/apis/scheduling/scheme"
vcv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
vcclient "volcano.sh/apis/pkg/client/clientset/versioned"
"volcano.sh/apis/pkg/client/clientset/versioned/scheme"
vcinformer "volcano.sh/apis/pkg/client/informers/externalversions"
cpuinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/nodeinfo/v1alpha1"
vcinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
"volcano.sh/volcano/cmd/scheduler/app/options"
schedulingapi "volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/metrics"
)
func init() {
schemeBuilder := runtime.SchemeBuilder{
v1.AddToScheme,
}
utilruntime.Must(schemeBuilder.AddToScheme(scheme.Scheme))
}
// New returns a Cache implementation.
func New(config *rest.Config, schedulerName string, defaultQueue string, nodeSelectors []string) Cache {
return newSchedulerCache(config, schedulerName, defaultQueue, nodeSelectors)
}
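// An illustrative sketch (not part of the package) of constructing the cache
// from an in-cluster configuration; the scheduler name "volcano" and queue
// name "default" are assumptions, not values mandated by the package:
//
//	config, err := rest.InClusterConfig()
//	if err != nil {
//		klog.Fatalf("failed to build in-cluster config: %v", err)
//	}
//	// nodeSelectors may be nil when the scheduler should watch every node.
//	schedCache := New(config, "volcano", "default", nil)
//	_ = schedCache // handed to the scheduler framework elsewhere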
// SchedulerCache is the cache for the kube batch scheduler
type SchedulerCache struct {
sync.Mutex
kubeClient *kubernetes.Clientset
vcClient *vcclient.Clientset
defaultQueue string
// schedulerName is the name for volcano scheduler
schedulerName string
nodeSelectorLabels map[string]string
podInformer infov1.PodInformer
nodeInformer infov1.NodeInformer
podGroupInformerV1beta1 vcinformerv1.PodGroupInformer
queueInformerV1beta1 vcinformerv1.QueueInformer
pvInformer infov1.PersistentVolumeInformer
pvcInformer infov1.PersistentVolumeClaimInformer
scInformer storagev1.StorageClassInformer
pcInformer schedv1.PriorityClassInformer
quotaInformer infov1.ResourceQuotaInformer
csiNodeInformer storagev1.CSINodeInformer
csiDriverInformer storagev1.CSIDriverInformer
csiStorageCapacityInformer storagev1alpha1.CSIStorageCapacityInformer
cpuInformer cpuinformerv1.NumatopologyInformer
Binder Binder
Evictor Evictor
StatusUpdater StatusUpdater
PodGroupBinder BatchBinder
VolumeBinder VolumeBinder
Recorder record.EventRecorder
Jobs map[schedulingapi.JobID]*schedulingapi.JobInfo
Nodes map[string]*schedulingapi.NodeInfo
Queues map[schedulingapi.QueueID]*schedulingapi.QueueInfo
PriorityClasses map[string]*schedulingv1.PriorityClass
NodeList []string
defaultPriorityClass *schedulingv1.PriorityClass
defaultPriority int32
NamespaceCollection map[string]*schedulingapi.NamespaceCollection
errTasks workqueue.RateLimitingInterface
deletedJobs workqueue.RateLimitingInterface
informerFactory informers.SharedInformerFactory
vcInformerFactory vcinformer.SharedInformerFactory
BindFlowChannel chan *schedulingapi.TaskInfo
bindCache []*schedulingapi.TaskInfo
batchNum int
}
type defaultBinder struct {
kubeclient *kubernetes.Clientset
}
// Bind will send the bind request to the api server
func (db *defaultBinder) Bind(kubeClient *kubernetes.Clientset, tasks []*schedulingapi.TaskInfo) (error, []*schedulingapi.TaskInfo) {
var errTasks []*schedulingapi.TaskInfo
for _, task := range tasks {
p := task.Pod
if err := kubeClient.CoreV1().Pods(p.Namespace).Bind(context.TODO(),
&v1.Binding{
ObjectMeta: metav1.ObjectMeta{Namespace: p.Namespace, Name: p.Name, UID: p.UID, Annotations: p.Annotations},
Target: v1.ObjectReference{
Kind: "Node",
Name: task.NodeName,
},
},
metav1.CreateOptions{}); err != nil {
klog.Errorf("Failed to bind pod <%v/%v> to node %s : %#v", p.Namespace, p.Name, task.NodeName, err)
errTasks = append(errTasks, task)
}
}
if len(errTasks) > 0 {
return fmt.Errorf("failed to bind pods"), errTasks
}
return nil, nil
}
// NewBinder returns a defaultBinder.
func NewBinder() *defaultBinder {
return &defaultBinder{}
}
type defaultEvictor struct {
kubeclient *kubernetes.Clientset
recorder record.EventRecorder
}
// Evict will send a delete request for the pod to the api server
func (de *defaultEvictor) Evict(p *v1.Pod, reason string) error {
klog.V(3).Infof("Evicting pod %v/%v, because of %v", p.Namespace, p.Name, reason)
evictMsg := fmt.Sprintf("Pod is evicted, because of %v", reason)
annotations := map[string]string{}
// record that we are evicting the pod
de.recorder.AnnotatedEventf(p, annotations, v1.EventTypeWarning, "Evict", evictMsg)
pod := p.DeepCopy()
condition := &v1.PodCondition{
Type: v1.PodReady,
Status: v1.ConditionFalse,
Reason: "Evict",
Message: evictMsg,
}
if !podutil.UpdatePodCondition(&pod.Status, condition) {
klog.V(1).Infof("UpdatePodCondition: existed condition, not update")
klog.V(1).Infof("%+v", pod.Status.Conditions)
return nil
}
if _, err := de.kubeclient.CoreV1().Pods(p.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}); err != nil {
klog.Errorf("Failed to update pod <%v/%v> status: %v", pod.Namespace, pod.Name, err)
return err
}
if err := de.kubeclient.CoreV1().Pods(p.Namespace).Delete(context.TODO(), p.Name, metav1.DeleteOptions{}); err != nil {
klog.Errorf("Failed to evict pod <%v/%v>: %#v", p.Namespace, p.Name, err)
return err
}
return nil
}
// defaultStatusUpdater is the default implementation of the StatusUpdater interface
type defaultStatusUpdater struct {
kubeclient *kubernetes.Clientset
vcclient *vcclient.Clientset
}
// following the same logic as podutil.UpdatePodCondition
func podConditionHaveUpdate(status *v1.PodStatus, condition *v1.PodCondition) bool {
lastTransitionTime := metav1.Now()
// Try to find this pod condition.
_, oldCondition := podutil.GetPodCondition(status, condition.Type)
if oldCondition == nil {
// We are adding new pod condition.
return true
}
// We are updating an existing condition, so we need to check if it has changed.
if condition.Status == oldCondition.Status {
lastTransitionTime = oldCondition.LastTransitionTime
}
isEqual := condition.Status == oldCondition.Status &&
condition.Reason == oldCondition.Reason &&
condition.Message == oldCondition.Message &&
condition.LastProbeTime.Equal(&oldCondition.LastProbeTime) &&
lastTransitionTime.Equal(&oldCondition.LastTransitionTime)
// Return true if one of the fields has changed.
return !isEqual
}
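// An illustrative sketch (not part of the package) of what
// podConditionHaveUpdate reports: a condition that does not exist yet, or one
// whose fields changed, needs an update, while repeating the stored condition
// does not. The message text below is made up:
//
//	cond := &v1.PodCondition{
//		Type:    v1.PodScheduled,
//		Status:  v1.ConditionFalse,
//		Reason:  "Unschedulable",
//		Message: "0/3 nodes are available",
//	}
//	status := &v1.PodStatus{} // no PodScheduled condition recorded yet
//
//	_ = podConditionHaveUpdate(status, cond) // true: the condition would be added
//	podutil.UpdatePodCondition(status, cond) // store it
//	_ = podConditionHaveUpdate(status, cond) // false: nothing changed since
//	cond.Message = "0/4 nodes are available"
//	_ = podConditionHaveUpdate(status, cond) // true: the message changed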
// UpdatePodCondition updates the pod status with the given podCondition
func (su *defaultStatusUpdater) UpdatePodCondition(pod *v1.Pod, condition *v1.PodCondition) (*v1.Pod, error) {
klog.V(3).Infof("Updating pod condition for %s/%s to (%s==%s)", pod.Namespace, pod.Name, condition.Type, condition.Status)
if podutil.UpdatePodCondition(&pod.Status, condition) {
return su.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
}
return pod, nil
}
// UpdatePodGroup updates the PodGroup via the Volcano client and returns the refreshed object
func (su *defaultStatusUpdater) UpdatePodGroup(pg *schedulingapi.PodGroup) (*schedulingapi.PodGroup, error) {
podgroup := &vcv1beta1.PodGroup{}
if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil {
klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
return nil, err
}
updated, err := su.vcclient.SchedulingV1beta1().PodGroups(podgroup.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Error while updating PodGroup with error: %v", err)
return nil, err
}
podGroupInfo := &schedulingapi.PodGroup{Version: schedulingapi.PodGroupVersionV1Beta1}
if err := schedulingscheme.Scheme.Convert(updated, &podGroupInfo.PodGroup, nil); err != nil {
klog.Errorf("Error while converting v1alpha.PodGroup to api.PodGroup with error: %v", err)
return nil, err
}
return podGroupInfo, nil
}
type defaultVolumeBinder struct {
volumeBinder volumescheduling.SchedulerVolumeBinder
}
// AllocateVolumes allocates volume on the host to the task
func (dvb *defaultVolumeBinder) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
allBound, err := dvb.volumeBinder.AssumePodVolumes(task.Pod, hostname, podVolumes)
task.VolumeReady = allBound
return err
}
// GetPodVolumes gets the pod volumes on the host
func (dvb *defaultVolumeBinder) GetPodVolumes(task *schedulingapi.TaskInfo,
node *v1.Node) (podVolumes *volumescheduling.PodVolumes, err error) {
boundClaims, claimsToBind, _, err := dvb.volumeBinder.GetPodVolumes(task.Pod)
if err != nil {
return nil, err
}
podVolumes, _, err = dvb.volumeBinder.FindPodVolumes(task.Pod, boundClaims, claimsToBind, node)
return podVolumes, err
}
// BindVolumes binds volumes to the task
func (dvb *defaultVolumeBinder) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
// If the task's volumes are already bound, do not bind them again.
if task.VolumeReady {
return nil
}
return dvb.volumeBinder.BindPodVolumes(task.Pod, podVolumes)
}
type podgroupBinder struct {
kubeclient *kubernetes.Clientset
vcclient *vcclient.Clientset
}
// Bind will add the silo cluster annotation to the pod and podgroup
func (pgb *podgroupBinder) Bind(job *schedulingapi.JobInfo, cluster string) (*schedulingapi.JobInfo, error) {
if len(job.Tasks) == 0 {
klog.V(4).Infof("Job pods have not been created yet")
return job, nil
}
for _, task := range job.Tasks {
pod := task.Pod
pod.Annotations[batch.ForwardClusterKey] = cluster
pod.ResourceVersion = ""
_, err := pgb.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Error while update pod annotation with error: %v", err)
return nil, err
}
}
pg := job.PodGroup
pg.Annotations[batch.ForwardClusterKey] = cluster
podgroup := &vcv1beta1.PodGroup{}
if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil {
klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
return nil, err
}
newPg, err := pgb.vcclient.SchedulingV1beta1().PodGroups(pg.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Error while update PodGroup annotation with error: %v", err)
return nil, err
}
job.PodGroup.ResourceVersion = newPg.ResourceVersion
klog.V(4).Infof("Bind PodGroup <%s> successfully", job.PodGroup.Name)
return job, nil
}
func newSchedulerCache(config *rest.Config, schedulerName string, defaultQueue string, nodeSelectors []string) *SchedulerCache {
kubeClient, err := kubernetes.NewForConfig(config)
if err != nil {
panic(fmt.Sprintf("failed init kubeClient, with err: %v", err))
}
vcClient, err := vcclient.NewForConfig(config)
if err != nil {
panic(fmt.Sprintf("failed init vcClient, with err: %v", err))
}
eventClient, err := kubernetes.NewForConfig(config)
if err != nil {
panic(fmt.Sprintf("failed init eventClient, with err: %v", err))
}
// create default queue
reclaimable := true
defaultQue := vcv1beta1.Queue{
ObjectMeta: metav1.ObjectMeta{
Name: defaultQueue,
},
Spec: vcv1beta1.QueueSpec{
Reclaimable: &reclaimable,
Weight: 1,
},
}
if _, err := vcClient.SchedulingV1beta1().Queues().Create(context.TODO(), &defaultQue, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
panic(fmt.Sprintf("failed init default queue, with err: %v", err))
}
sc := &SchedulerCache{
Jobs: make(map[schedulingapi.JobID]*schedulingapi.JobInfo),
Nodes: make(map[string]*schedulingapi.NodeInfo),
Queues: make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo),
PriorityClasses: make(map[string]*schedulingv1.PriorityClass),
errTasks: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
deletedJobs: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
kubeClient: kubeClient,
vcClient: vcClient,
defaultQueue: defaultQueue,
schedulerName: schedulerName,
nodeSelectorLabels: make(map[string]string),
NamespaceCollection: make(map[string]*schedulingapi.NamespaceCollection),
NodeList: []string{},
}
if len(nodeSelectors) > 0 {
for _, nodeSelectorLabel := range nodeSelectors {
nodeSelectorLabelLen := len(nodeSelectorLabel)
if nodeSelectorLabelLen <= 0 {
continue
}
// check input
index := strings.Index(nodeSelectorLabel, ":")
if index < 0 || index >= (nodeSelectorLabelLen-1) {
continue
}
nodeSelectorLabelName := strings.TrimSpace(nodeSelectorLabel[:index])
nodeSelectorLabelValue := strings.TrimSpace(nodeSelectorLabel[index+1:])
key := nodeSelectorLabelName + ":" + nodeSelectorLabelValue
sc.nodeSelectorLabels[key] = ""
}
}
// Prepare event clients.
broadcaster := record.NewBroadcaster()
broadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: eventClient.CoreV1().Events("")})
sc.Recorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: schedulerName})
sc.BindFlowChannel = make(chan *schedulingapi.TaskInfo, 5000)
sc.Binder = GetBindMethod()
var batchNum int
batchNum, err = strconv.Atoi(os.Getenv("BATCH_BIND_NUM"))
if err == nil && batchNum > 0 {
sc.batchNum = batchNum
} else {
sc.batchNum = 1
}
sc.Evictor = &defaultEvictor{
kubeclient: sc.kubeClient,
recorder: sc.Recorder,
}
sc.StatusUpdater = &defaultStatusUpdater{
kubeclient: sc.kubeClient,
vcclient: sc.vcClient,
}
sc.PodGroupBinder = &podgroupBinder{
kubeclient: sc.kubeClient,
vcclient: sc.vcClient,
}
informerFactory := informers.NewSharedInformerFactory(sc.kubeClient, 0)
sc.informerFactory = informerFactory
mySchedulerPodName, c := getMultiSchedulerInfo()
// create informer for node information
sc.nodeInformer = informerFactory.Core().V1().Nodes()
sc.nodeInformer.Informer().AddEventHandlerWithResyncPeriod(
cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
node, ok := obj.(*v1.Node)
if !ok {
klog.Errorf("Cannot convert to *v1.Node: %v", obj)
return false
}
if !responsibleForNode(node.Name, mySchedulerPodName, c) {
return false
}
if len(sc.nodeSelectorLabels) == 0 {
return true
}
for labelName, labelValue := range node.Labels {
key := labelName + ":" + labelValue
if _, ok := sc.nodeSelectorLabels[key]; ok {
return true
}
}
klog.Infof("node %s ignore add/update/delete into schedulerCache", node.Name)
return false
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: sc.AddNode,
UpdateFunc: sc.UpdateNode,
DeleteFunc: sc.DeleteNode,
},
},
0,
)
sc.podInformer = informerFactory.Core().V1().Pods()
sc.pvcInformer = informerFactory.Core().V1().PersistentVolumeClaims()
sc.pvInformer = informerFactory.Core().V1().PersistentVolumes()
sc.scInformer = informerFactory.Storage().V1().StorageClasses()
sc.csiNodeInformer = informerFactory.Storage().V1().CSINodes()
sc.csiDriverInformer = informerFactory.Storage().V1().CSIDrivers()
sc.csiStorageCapacityInformer = informerFactory.Storage().V1alpha1().CSIStorageCapacities()
var capacityCheck *volumescheduling.CapacityCheck
if options.ServerOpts.EnableCSIStorage {
capacityCheck = &volumescheduling.CapacityCheck{
CSIDriverInformer: sc.csiDriverInformer,
CSIStorageCapacityInformer: sc.csiStorageCapacityInformer,
}
} else {
capacityCheck = nil
}
sc.VolumeBinder = &defaultVolumeBinder{
volumeBinder: volumescheduling.NewVolumeBinder(
sc.kubeClient,
sc.podInformer,
sc.nodeInformer,
sc.csiNodeInformer,
sc.pvcInformer,
sc.pvInformer,
sc.scInformer,
capacityCheck,
30*time.Second,
),
}
// create informer for pod information
sc.podInformer.Informer().AddEventHandler(
cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch v := obj.(type) {
case *v1.Pod:
if !responsibleForPod(v, schedulerName, mySchedulerPodName, c) {
if len(v.Spec.NodeName) == 0 {
return false
}
if !responsibleForNode(v.Spec.NodeName, mySchedulerPodName, c) {
return false
}
}
return true
default:
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: sc.AddPod,
UpdateFunc: sc.UpdatePod,
DeleteFunc: sc.DeletePod,
},
})
if options.ServerOpts.EnablePriorityClass {
sc.pcInformer = informerFactory.Scheduling().V1().PriorityClasses()
sc.pcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: sc.AddPriorityClass,
UpdateFunc: sc.UpdatePriorityClass,
DeleteFunc: sc.DeletePriorityClass,
})
}
sc.quotaInformer = informerFactory.Core().V1().ResourceQuotas()
sc.quotaInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: sc.AddResourceQuota,
UpdateFunc: sc.UpdateResourceQuota,
DeleteFunc: sc.DeleteResourceQuota,
})
vcinformers := vcinformer.NewSharedInformerFactory(sc.vcClient, 0)
sc.vcInformerFactory = vcinformers
// create informer for PodGroup(v1beta1) information
sc.podGroupInformerV1beta1 = vcinformers.Scheduling().V1beta1().PodGroups()
sc.podGroupInformerV1beta1.Informer().AddEventHandler(
cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch v := obj.(type) {
case *vcv1beta1.PodGroup:
return responsibleForPodGroup(v, mySchedulerPodName, c)
default:
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: sc.AddPodGroupV1beta1,
UpdateFunc: sc.UpdatePodGroupV1beta1,
DeleteFunc: sc.DeletePodGroupV1beta1,
},
})
// create informer(v1beta1) for Queue information
sc.queueInformerV1beta1 = vcinformers.Scheduling().V1beta1().Queues()
sc.queueInformerV1beta1.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: sc.AddQueueV1beta1,
UpdateFunc: sc.UpdateQueueV1beta1,
DeleteFunc: sc.DeleteQueueV1beta1,
})
sc.cpuInformer = vcinformers.Nodeinfo().V1alpha1().Numatopologies()
sc.cpuInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: sc.AddNumaInfoV1alpha1,
UpdateFunc: sc.UpdateNumaInfoV1alpha1,
DeleteFunc: sc.DeleteNumaInfoV1alpha1,
})
return sc
}
// Run starts the schedulerCache
func (sc *SchedulerCache) Run(stopCh <-chan struct{}) {
sc.informerFactory.Start(stopCh)
sc.vcInformerFactory.Start(stopCh)
// Re-sync error tasks.
go wait.Until(sc.processResyncTask, 0, stopCh)
// Cleanup jobs.
go wait.Until(sc.processCleanupJob, 0, stopCh)
go wait.Until(sc.processBindTask, time.Millisecond*20, stopCh)
}
// WaitForCacheSync waits for all informer caches to be synced with the api server
func (sc *SchedulerCache) WaitForCacheSync(stopCh <-chan struct{}) {
sc.informerFactory.WaitForCacheSync(stopCh)
sc.vcInformerFactory.WaitForCacheSync(stopCh)
}
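// An illustrative sketch (not part of the package) of the expected start-up
// order: start the informer factories, wait for their caches to sync, and only
// then take the first snapshot. Here sc is assumed to be an initialized
// *SchedulerCache and stopCh an assumed shutdown channel:
//
//	stopCh := make(chan struct{})
//	sc.Run(stopCh)
//	sc.WaitForCacheSync(stopCh)
//	snapshot := sc.Snapshot() // safe only after the caches have synced
//	_ = snapshot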
// findJobAndTask returns job and the task info
func (sc *SchedulerCache) findJobAndTask(taskInfo *schedulingapi.TaskInfo) (*schedulingapi.JobInfo, *schedulingapi.TaskInfo, error) {
job, found := sc.Jobs[taskInfo.Job]
if !found {
return nil, nil, fmt.Errorf("failed to find Job %v for Task %v",
taskInfo.Job, taskInfo.UID)
}
task, found := job.Tasks[taskInfo.UID]
if !found {
return nil, nil, fmt.Errorf("failed to find task in status %v by id %v",
taskInfo.Status, taskInfo.UID)
}
return job, task, nil
}
// Evict will evict the pod.
//
// If an error occurs, both the task and the job are guaranteed to be in their original state.
func (sc *SchedulerCache) Evict(taskInfo *schedulingapi.TaskInfo, reason string) error {
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
job, task, err := sc.findJobAndTask(taskInfo)
if err != nil {
return err
}
node, found := sc.Nodes[task.NodeName]
if !found {
return fmt.Errorf("failed to bind Task %v to host %v, host does not exist",
task.UID, task.NodeName)
}
originalStatus := task.Status
if err := job.UpdateTaskStatus(task, schedulingapi.Releasing); err != nil {
return err
}
// Update the task on the node.
if err := node.UpdateTask(task); err != nil {
// After failing to update task to a node we need to revert task status from Releasing,
// otherwise task might be stuck in the Releasing state indefinitely.
if err := job.UpdateTaskStatus(task, originalStatus); err != nil {
klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+
"from %s to %s after failing to update Task on Node <%s>: %v",
task.Namespace, task.Name, task.Status, originalStatus, node.Name, err)
sc.resyncTask(task)
}
return err
}
p := task.Pod
go func() {
err := sc.Evictor.Evict(p, reason)
if err != nil {
sc.resyncTask(task)
}
}()
podgroup := &vcv1beta1.PodGroup{}
if err := schedulingscheme.Scheme.Convert(&job.PodGroup.PodGroup, podgroup, nil); err != nil {
klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
return err
}
sc.Recorder.Eventf(podgroup, v1.EventTypeNormal, "Evict", reason)
return nil
}
// Bind binds the tasks to their target hosts.
func (sc *SchedulerCache) Bind(tasks []*schedulingapi.TaskInfo) error {
go func(taskArray []*schedulingapi.TaskInfo) {
tmp := time.Now()
err, errTasks := sc.Binder.Bind(sc.kubeClient, taskArray)
if err == nil {
klog.V(3).Infof("bind ok, latency %v", time.Since(tmp))
for _, task := range tasks {
sc.Recorder.Eventf(task.Pod, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v",
task.Namespace, task.Name, task.NodeName)
}
} else {
for _, task := range errTasks {
klog.V(2).Infof("resyncTask task %s", task.Name)
sc.resyncTask(task)
}
}
}(tasks)
return nil
}
// BindPodGroup binds job to silo cluster
func (sc *SchedulerCache) BindPodGroup(job *schedulingapi.JobInfo, cluster string) error {
if _, err := sc.PodGroupBinder.Bind(job, cluster); err != nil {
klog.Errorf("Bind job <%s> to cluster <%s> failed: %v", job.Name, cluster, err)
return err
}
return nil
}
// GetPodVolumes gets the pod volumes on the host
func (sc *SchedulerCache) GetPodVolumes(task *schedulingapi.TaskInfo, node *v1.Node) (*volumescheduling.PodVolumes, error) {
return sc.VolumeBinder.GetPodVolumes(task, node)
}
// AllocateVolumes allocates volume on the host to the task
func (sc *SchedulerCache) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
return sc.VolumeBinder.AllocateVolumes(task, hostname, podVolumes)
}
// BindVolumes binds volumes to the task
func (sc *SchedulerCache) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
return sc.VolumeBinder.BindVolumes(task, podVolumes)
}
// Client returns the kubernetes clientSet
func (sc *SchedulerCache) Client() kubernetes.Interface {
return sc.kubeClient
}
// SharedInformerFactory returns the scheduler SharedInformerFactory
func (sc *SchedulerCache) SharedInformerFactory() informers.SharedInformerFactory {
return sc.informerFactory
}
// UpdateSchedulerNumaInfo updates the NumaSchedulerInfo of nodes in the scheduler cache
func (sc *SchedulerCache) UpdateSchedulerNumaInfo(AllocatedSets map[string]schedulingapi.ResNumaSets) error {
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
for nodeName, sets := range AllocatedSets {
if _, found := sc.Nodes[nodeName]; !found {
continue
}
numaInfo := sc.Nodes[nodeName].NumaSchedulerInfo
if numaInfo == nil {
continue
}
numaInfo.Allocate(sets)
}
return nil
}
// taskUnschedulable updates pod status of pending task
func (sc *SchedulerCache) taskUnschedulable(task *schedulingapi.TaskInfo, reason, message string) error {
pod := task.Pod
condition := &v1.PodCondition{
Type: v1.PodScheduled,
Status: v1.ConditionFalse,
Reason: reason, // Add more reasons to distinguish more specific scenarios of pending tasks
Message: message,
}
if podConditionHaveUpdate(&pod.Status, condition) {
pod = pod.DeepCopy()
// The reason field in 'Events' should be "FailedScheduling"; there is no constant defined for this in
// k8s core, so the same string is used here.
// The reason field in PodCondition can be "Unschedulable".
sc.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", message)
if _, err := sc.StatusUpdater.UpdatePodCondition(pod, condition); err != nil {
return err
}
} else {
klog.V(4).Infof("task unscheduleable %s/%s, message: %s, skip by no condition update", pod.Namespace, pod.Name, message)
}
return nil
}
func (sc *SchedulerCache) deleteJob(job *schedulingapi.JobInfo) {
klog.V(3).Infof("Try to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
sc.deletedJobs.AddRateLimited(job)
}
func (sc *SchedulerCache) processCleanupJob() {
obj, shutdown := sc.deletedJobs.Get()
if shutdown {
return
}
defer sc.deletedJobs.Done(obj)
job, found := obj.(*schedulingapi.JobInfo)
if !found {
klog.Errorf("Failed to convert <%v> to *JobInfo", obj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
if schedulingapi.JobTerminated(job) {
delete(sc.Jobs, job.UID)
klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name)
} else {
// Retry
sc.deleteJob(job)
}
}
func (sc *SchedulerCache) resyncTask(task *schedulingapi.TaskInfo) {
sc.errTasks.AddRateLimited(task)
}
func (sc *SchedulerCache) processResyncTask() {
obj, shutdown := sc.errTasks.Get()
if shutdown {
return
}
defer sc.errTasks.Done(obj)
task, ok := obj.(*schedulingapi.TaskInfo)
if !ok {
klog.Errorf("failed to convert %v to *schedulingapi.TaskInfo", obj)
return
}
if err := sc.syncTask(task); err != nil {
klog.Errorf("Failed to sync pod <%v/%v>, retry it.", task.Namespace, task.Name)
sc.resyncTask(task)
}
}
// AddBindTask moves the task to Binding status and queues it into the bind flow channel.
func (sc *SchedulerCache) AddBindTask(taskInfo *schedulingapi.TaskInfo) error {
klog.V(5).Infof("add bind task %v/%v", taskInfo.Namespace, taskInfo.Name)
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
job, task, err := sc.findJobAndTask(taskInfo)
if err != nil {
return err
}
node, found := sc.Nodes[taskInfo.NodeName]
if !found {
return fmt.Errorf("failed to bind Task %v to host %v, host does not exist",
task.UID, taskInfo.NodeName)
}
originalStatus := task.Status
if err := job.UpdateTaskStatus(task, schedulingapi.Binding); err != nil {
return err
}
// Add task to the node.
if err := node.AddTask(task); err != nil {
// After failing to add the task to a node we need to revert the task status from Binding,
// otherwise the task might be stuck in the Binding state indefinitely.
if err := job.UpdateTaskStatus(task, originalStatus); err != nil {
klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+
"from %s to %s after failing to update Task on Node <%s>: %v",
task.Namespace, task.Name, task.Status, originalStatus, node.Name, err)
sc.resyncTask(task)
}
return err
}
sc.BindFlowChannel <- taskInfo
return nil
}
func (sc *SchedulerCache) processBindTask() {
for {
select {
case taskInfo, ok := <-sc.BindFlowChannel:
if !ok {
return
}
sc.bindCache = append(sc.bindCache, taskInfo)
if len(sc.bindCache) == sc.batchNum {
sc.BindTask()
}
}
if len(sc.BindFlowChannel) == 0 {
break
}
}
if len(sc.bindCache) == 0 {
return
}
sc.BindTask()
}
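// The flush threshold used above comes from the BATCH_BIND_NUM environment
// variable read in newSchedulerCache; non-numeric or non-positive values fall
// back to a batch size of 1. An illustrative deployment-side sketch (an
// assumption, not shipped configuration):
//
//	// In the scheduler container spec:
//	//   env:
//	//   - name: BATCH_BIND_NUM
//	//     value: "8"
//	// processBindTask then calls BindTask once eight tasks have accumulated,
//	// or earlier when the BindFlowChannel drains.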
// BindTask flushes the bind cache: it binds the tasks' volumes and then dispatches the batched bind requests.
func (sc *SchedulerCache) BindTask() {
klog.V(5).Infof("batch bind task count %d", len(sc.bindCache))
for _, task := range sc.bindCache {
if err := sc.BindVolumes(task, task.PodVolumes); err != nil {
klog.Errorf("task %s/%s bind Volumes failed: %#v", task.Namespace, task.Name, err)
sc.resyncTask(task)
return
}
}
bindTasks := make([]*schedulingapi.TaskInfo, len(sc.bindCache))
copy(bindTasks, sc.bindCache)
if err := sc.Bind(bindTasks); err != nil {
return
}
for _, task := range sc.bindCache {
metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
}
sc.bindCache = sc.bindCache[0:0]
return
}
// Snapshot returns the complete snapshot of the cluster from cache
func (sc *SchedulerCache) Snapshot() *schedulingapi.ClusterInfo {
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
snapshot := &schedulingapi.ClusterInfo{
Nodes: make(map[string]*schedulingapi.NodeInfo),
Jobs: make(map[schedulingapi.JobID]*schedulingapi.JobInfo),
Queues: make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo),
NamespaceInfo: make(map[schedulingapi.NamespaceName]*schedulingapi.NamespaceInfo),
RevocableNodes: make(map[string]*schedulingapi.NodeInfo),
NodeList: make([]string, len(sc.NodeList)),
}
copy(snapshot.NodeList, sc.NodeList)
for _, value := range sc.Nodes {
value.RefreshNumaSchedulerInfoByCrd()
}
for _, value := range sc.Nodes {
if !value.Ready() {
continue
}
snapshot.Nodes[value.Name] = value.Clone()
if value.RevocableZone != "" {
snapshot.RevocableNodes[value.Name] = snapshot.Nodes[value.Name]
}
}
for _, value := range sc.Queues {
snapshot.Queues[value.UID] = value.Clone()
}
var cloneJobLock sync.Mutex
var wg sync.WaitGroup
cloneJob := func(value *schedulingapi.JobInfo) {
defer wg.Done()
if value.PodGroup != nil {
value.Priority = sc.defaultPriority
priName := value.PodGroup.Spec.PriorityClassName
if priorityClass, found := sc.PriorityClasses[priName]; found {
value.Priority = priorityClass.Value
}
klog.V(4).Infof("The priority of job <%s/%s> is <%s/%d>",
value.Namespace, value.Name, priName, value.Priority)
}
clonedJob := value.Clone()
cloneJobLock.Lock()
snapshot.Jobs[value.UID] = clonedJob
cloneJobLock.Unlock()
}
for _, value := range sc.NamespaceCollection {
info := value.Snapshot()
snapshot.NamespaceInfo[info.Name] = info
klog.V(4).Infof("Namespace %s has weight %v",
value.Name, info.GetWeight())
}
for _, value := range sc.Jobs {
// If there is no scheduling spec, do not handle it.
if value.PodGroup == nil {
klog.V(4).Infof("The scheduling spec of Job <%v:%s/%s> is nil, ignore it.",
value.UID, value.Namespace, value.Name)
continue
}
if _, found := snapshot.Queues[value.Queue]; !found {
klog.V(3).Infof("The Queue <%v> of Job <%v/%v> does not exist, ignore it.",
value.Queue, value.Namespace, value.Name)
continue
}
wg.Add(1)
go cloneJob(value)
}
wg.Wait()
klog.V(3).Infof("There are <%d> Jobs, <%d> Queues and <%d> Nodes in total for scheduling.",
len(snapshot.Jobs), len(snapshot.Queues), len(snapshot.Nodes))
return snapshot
}
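// An illustrative sketch (not part of the package) of consuming a snapshot;
// the logging in the loop body is an assumption about what a caller might do:
//
//	snapshot := sc.Snapshot()
//	for _, job := range snapshot.Jobs {
//		klog.V(4).Infof("job <%s/%s> in queue <%s> with %d tasks",
//			job.Namespace, job.Name, job.Queue, len(job.Tasks))
//	}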
// String returns information about the cache in a string format
func (sc *SchedulerCache) String() string {
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
str := "Cache:\n"
if len(sc.Nodes) != 0 {
str += "Nodes:\n"
for _, n := range sc.Nodes {
str += fmt.Sprintf("\t %s: idle(%v) used(%v) allocatable(%v) pods(%d)\n",
n.Name, n.Idle, n.Used, n.Allocatable, len(n.Tasks))
i := 0
for _, p := range n.Tasks {
str += fmt.Sprintf("\t\t %d: %v\n", i, p)
i++
}
}
}
if len(sc.Jobs) != 0 {
str += "Jobs:\n"
for _, job := range sc.Jobs {
str += fmt.Sprintf("\t %s\n", job)
}
}
if len(sc.NamespaceCollection) != 0 {
str += "Namespaces:\n"
for _, ns := range sc.NamespaceCollection {
info := ns.Snapshot()
str += fmt.Sprintf("\t Namespace(%s) Weight(%v)\n",
info.Name, info.Weight)
}
}
if len(sc.NodeList) != 0 {
str += fmt.Sprintf("NodeList: %v\n", sc.NodeList)
}
return str
}
// RecordJobStatusEvent records related events according to job status.
func (sc *SchedulerCache) RecordJobStatusEvent(job *schedulingapi.JobInfo) {
pgUnschedulable := job.PodGroup != nil &&
(job.PodGroup.Status.Phase == scheduling.PodGroupUnknown ||
job.PodGroup.Status.Phase == scheduling.PodGroupPending ||
job.PodGroup.Status.Phase == scheduling.PodGroupInqueue)
// If pending or unschedulable, record unschedulable event.
if pgUnschedulable {
msg := fmt.Sprintf("%v/%v tasks in gang unschedulable: %v",
len(job.TaskStatusIndex[schedulingapi.Pending]),
len(job.Tasks),
job.FitError())
sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeWarning, string(scheduling.PodGroupUnschedulableType), msg)
} else {
sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupScheduled), string(scheduling.PodGroupReady))
}
baseErrorMessage := job.JobFitErrors
if baseErrorMessage == "" {
baseErrorMessage = schedulingapi.AllNodeUnavailableMsg
}
// Update the podCondition for Allocated, Pending and Pipelined tasks before the job is discarded
for _, status := range []schedulingapi.TaskStatus{schedulingapi.Allocated, schedulingapi.Pending, schedulingapi.Pipelined} {
for _, taskInfo := range job.TaskStatusIndex[status] {
reason, msg := job.TaskSchedulingReason(taskInfo.UID)
if len(msg) == 0 {
msg = baseErrorMessage
}
if err := sc.taskUnschedulable(taskInfo, reason, msg); err != nil {
klog.Errorf("Failed to update unschedulable task status <%s/%s>: %v",
taskInfo.Namespace, taskInfo.Name, err)
}
}
}
}
// UpdateJobStatus updates the status of the job and its tasks.
func (sc *SchedulerCache) UpdateJobStatus(job *schedulingapi.JobInfo, updatePG bool) (*schedulingapi.JobInfo, error) {
if updatePG {
pg, err := sc.StatusUpdater.UpdatePodGroup(job.PodGroup)
if err != nil {
return nil, err
}
job.PodGroup = pg
}
sc.RecordJobStatusEvent(job)
return job, nil
}
func (sc *SchedulerCache) recordPodGroupEvent(podGroup *schedulingapi.PodGroup, eventType, reason, msg string) {
if podGroup == nil {
return
}
pg := &vcv1beta1.PodGroup{}
if err := schedulingscheme.Scheme.Convert(&podGroup.PodGroup, pg, nil); err != nil {
klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
return
}
sc.Recorder.Eventf(pg, eventType, reason, msg)
}
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"context"
"fmt"
"strconv"
v1 "k8s.io/api/core/v1"
schedulingv1 "k8s.io/api/scheduling/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
"volcano.sh/apis/pkg/apis/scheduling"
"volcano.sh/apis/pkg/apis/scheduling/scheme"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/apis/pkg/apis/utils"
schedulingapi "volcano.sh/volcano/pkg/scheduler/api"
)
func isTerminated(status schedulingapi.TaskStatus) bool {
return status == schedulingapi.Succeeded || status == schedulingapi.Failed
}
// getOrCreateJob returns the corresponding Job for pi if it exists, or creates and returns one if
// pi.Pod.Spec.SchedulerName is the same as the volcano scheduler's name; otherwise it returns nil.
func (sc *SchedulerCache) getOrCreateJob(pi *schedulingapi.TaskInfo) *schedulingapi.JobInfo {
if len(pi.Job) == 0 {
if pi.Pod.Spec.SchedulerName != sc.schedulerName {
klog.V(4).Infof("Pod %s/%s will not scheduled by %s, skip creating PodGroup and Job for it",
pi.Pod.Namespace, pi.Pod.Name, sc.schedulerName)
}
return nil
}
if _, found := sc.Jobs[pi.Job]; !found {
sc.Jobs[pi.Job] = schedulingapi.NewJobInfo(pi.Job)
}
return sc.Jobs[pi.Job]
}
func (sc *SchedulerCache) addTask(pi *schedulingapi.TaskInfo) error {
if len(pi.NodeName) != 0 {
if _, found := sc.Nodes[pi.NodeName]; !found {
sc.Nodes[pi.NodeName] = schedulingapi.NewNodeInfo(nil)
sc.Nodes[pi.NodeName].Name = pi.NodeName
}
node := sc.Nodes[pi.NodeName]
if !isTerminated(pi.Status) {
if err := node.AddTask(pi); err != nil {
if _, outOfSync := err.(*schedulingapi.AllocateFailError); outOfSync {
node.State = schedulingapi.NodeState{
Phase: schedulingapi.NotReady,
Reason: "OutOfSync",
}
}
return err
}
} else {
klog.V(4).Infof("Pod <%v/%v> is in status %s.", pi.Namespace, pi.Name, pi.Status.String())
}
}
job := sc.getOrCreateJob(pi)
if job != nil {
job.AddTaskInfo(pi)
}
return nil
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) addPod(pod *v1.Pod) error {
pi := schedulingapi.NewTaskInfo(pod)
return sc.addTask(pi)
}
func (sc *SchedulerCache) syncTask(oldTask *schedulingapi.TaskInfo) error {
newPod, err := sc.kubeClient.CoreV1().Pods(oldTask.Namespace).Get(context.TODO(), oldTask.Name, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
err := sc.deleteTask(oldTask)
if err != nil {
klog.Errorf("Failed to delete Pod <%v/%v> and remove from cache: %s", oldTask.Namespace, oldTask.Name, err.Error())
return err
}
klog.V(3).Infof("Pod <%v/%v> was deleted, removed from cache.", oldTask.Namespace, oldTask.Name)
return nil
}
return fmt.Errorf("failed to get Pod <%v/%v>: err %v", oldTask.Namespace, oldTask.Name, err)
}
newTask := schedulingapi.NewTaskInfo(newPod)
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
return sc.updateTask(oldTask, newTask)
}
func (sc *SchedulerCache) updateTask(oldTask, newTask *schedulingapi.TaskInfo) error {
if err := sc.deleteTask(oldTask); err != nil {
klog.Warningf("Failed to delete task: %v", err)
}
return sc.addTask(newTask)
}
// Check the pod allocated status in cache
func (sc *SchedulerCache) allocatedPodInCache(pod *v1.Pod) bool {
pi := schedulingapi.NewTaskInfo(pod)
if job, found := sc.Jobs[pi.Job]; found {
if t, found := job.Tasks[pi.UID]; found {
return schedulingapi.AllocatedStatus(t.Status)
}
}
return false
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) updatePod(oldPod, newPod *v1.Pod) error {
// Ignore the update event if the pod is allocated in the cache but its NodeName is not set.
if sc.allocatedPodInCache(newPod) && newPod.Spec.NodeName == "" {
klog.V(4).Infof("Pod <%s/%v> already in cache with allocated status, ignore the update event", newPod.Namespace, newPod.Name)
return nil
}
if err := sc.deletePod(oldPod); err != nil {
return err
}
// When a pod is being deleted, its ownerReferences may be cleared, making it look like an orphan pod.
if len(utils.GetController(newPod)) == 0 {
newPod.OwnerReferences = oldPod.OwnerReferences
}
return sc.addPod(newPod)
}
func (sc *SchedulerCache) deleteTask(pi *schedulingapi.TaskInfo) error {
var jobErr, nodeErr, numaErr error
if len(pi.Job) != 0 {
if job, found := sc.Jobs[pi.Job]; found {
jobErr = job.DeleteTaskInfo(pi)
} else {
jobErr = fmt.Errorf("failed to find Job <%v> for Task %v/%v",
pi.Job, pi.Namespace, pi.Name)
}
}
if len(pi.NodeName) != 0 {
node := sc.Nodes[pi.NodeName]
if node != nil {
nodeErr = node.RemoveTask(pi)
}
}
if jobErr != nil || nodeErr != nil {
return schedulingapi.MergeErrors(jobErr, nodeErr, numaErr)
}
return nil
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) deletePod(pod *v1.Pod) error {
pi := schedulingapi.NewTaskInfo(pod)
// Delete the Task in cache to handle Binding status.
task := pi
if job, found := sc.Jobs[pi.Job]; found {
if t, found := job.Tasks[pi.UID]; found {
task = t
}
}
if err := sc.deleteTask(task); err != nil {
klog.Warningf("Failed to delete task: %v", err)
}
// If job was terminated, delete it.
if job, found := sc.Jobs[pi.Job]; found && schedulingapi.JobTerminated(job) {
sc.deleteJob(job)
}
return nil
}
// AddPod add pod to scheduler cache
func (sc *SchedulerCache) AddPod(obj interface{}) {
pod, ok := obj.(*v1.Pod)
if !ok {
klog.Errorf("Cannot convert to *v1.Pod: %v", obj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
err := sc.addPod(pod)
if err != nil {
klog.Errorf("Failed to add pod <%s/%s> into cache: %v",
pod.Namespace, pod.Name, err)
return
}
klog.V(3).Infof("Added pod <%s/%v> into cache.", pod.Namespace, pod.Name)
}
// UpdatePod update pod to scheduler cache
func (sc *SchedulerCache) UpdatePod(oldObj, newObj interface{}) {
oldPod, ok := oldObj.(*v1.Pod)
if !ok {
klog.Errorf("Cannot convert oldObj to *v1.Pod: %v", oldObj)
return
}
newPod, ok := newObj.(*v1.Pod)
if !ok {
klog.Errorf("Cannot convert newObj to *v1.Pod: %v", newObj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
err := sc.updatePod(oldPod, newPod)
if err != nil {
klog.Errorf("Failed to update pod %v in cache: %v", oldPod.Name, err)
return
}
klog.V(4).Infof("Updated pod <%s/%v> in cache.", oldPod.Namespace, oldPod.Name)
}
// DeletePod delete pod from scheduler cache
func (sc *SchedulerCache) DeletePod(obj interface{}) {
var pod *v1.Pod
switch t := obj.(type) {
case *v1.Pod:
pod = t
case cache.DeletedFinalStateUnknown:
var ok bool
pod, ok = t.Obj.(*v1.Pod)
if !ok {
klog.Errorf("Cannot convert to *v1.Pod: %v", t.Obj)
return
}
default:
klog.Errorf("Cannot convert to *v1.Pod: %v", t)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
err := sc.deletePod(pod)
if err != nil {
klog.Errorf("Failed to delete pod %v from cache: %v", pod.Name, err)
return
}
klog.V(3).Infof("Deleted pod <%s/%v> from cache.", pod.Namespace, pod.Name)
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) addNode(node *v1.Node) error {
if sc.Nodes[node.Name] != nil {
sc.Nodes[node.Name].SetNode(node)
} else {
sc.Nodes[node.Name] = schedulingapi.NewNodeInfo(node)
}
return nil
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) updateNode(oldNode, newNode *v1.Node) error {
if sc.Nodes[newNode.Name] != nil {
sc.Nodes[newNode.Name].SetNode(newNode)
return nil
}
return fmt.Errorf("node <%s> does not exist", newNode.Name)
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) deleteNode(node *v1.Node) error {
if _, ok := sc.Nodes[node.Name]; !ok {
return fmt.Errorf("node <%s> does not exist", node.Name)
}
numaInfo := sc.Nodes[node.Name].NumaInfo
if numaInfo != nil {
klog.V(3).Infof("delete numatopo <%s/%s>", numaInfo.Namespace, numaInfo.Name)
err := sc.vcClient.NodeinfoV1alpha1().Numatopologies().Delete(context.TODO(), numaInfo.Name, metav1.DeleteOptions{})
if err != nil {
klog.Errorf("delete numatopo <%s/%s> failed.", numaInfo.Namespace, numaInfo.Name)
}
}
delete(sc.Nodes, node.Name)
return nil
}
// AddNode add node to scheduler cache
func (sc *SchedulerCache) AddNode(obj interface{}) {
node, ok := obj.(*v1.Node)
if !ok {
klog.Errorf("Cannot convert to *v1.Node: %v", obj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
err := sc.addNode(node)
if err != nil {
klog.Errorf("Failed to add node %s into cache: %v", node.Name, err)
return
}
sc.NodeList = append(sc.NodeList, node.Name)
}
// UpdateNode update node to scheduler cache
func (sc *SchedulerCache) UpdateNode(oldObj, newObj interface{}) {
oldNode, ok := oldObj.(*v1.Node)
if !ok {
klog.Errorf("Cannot convert oldObj to *v1.Node: %v", oldObj)
return
}
newNode, ok := newObj.(*v1.Node)
if !ok {
klog.Errorf("Cannot convert newObj to *v1.Node: %v", newObj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
err := sc.updateNode(oldNode, newNode)
if err != nil {
klog.Errorf("Failed to update node %v in cache: %v", oldNode.Name, err)
return
}
}
// DeleteNode delete node from scheduler cache
func (sc *SchedulerCache) DeleteNode(obj interface{}) {
var node *v1.Node
switch t := obj.(type) {
case *v1.Node:
node = t
case cache.DeletedFinalStateUnknown:
var ok bool
node, ok = t.Obj.(*v1.Node)
if !ok {
klog.Errorf("Cannot convert to *v1.Node: %v", t.Obj)
return
}
default:
klog.Errorf("Cannot convert to *v1.Node: %v", t)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
err := sc.deleteNode(node)
if err != nil {
klog.Errorf("Failed to delete node %s from cache: %v", node.Name, err)
return
}
for i, name := range sc.NodeList {
if name == node.Name {
sc.NodeList = append(sc.NodeList[:i], sc.NodeList[i+1:]...)
break
}
}
}
func getJobID(pg *schedulingapi.PodGroup) schedulingapi.JobID {
return schedulingapi.JobID(fmt.Sprintf("%s/%s", pg.Namespace, pg.Name))
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) setPodGroup(ss *schedulingapi.PodGroup) error {
job := getJobID(ss)
if _, found := sc.Jobs[job]; !found {
sc.Jobs[job] = schedulingapi.NewJobInfo(job)
}
sc.Jobs[job].SetPodGroup(ss)
// TODO(k82cn): set default queue in admission.
if len(ss.Spec.Queue) == 0 {
sc.Jobs[job].Queue = schedulingapi.QueueID(sc.defaultQueue)
}
return nil
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) updatePodGroup(newPodGroup *schedulingapi.PodGroup) error {
return sc.setPodGroup(newPodGroup)
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) deletePodGroup(id schedulingapi.JobID) error {
job, found := sc.Jobs[id]
if !found {
return fmt.Errorf("can not found job %v", id)
}
// Unset SchedulingSpec
job.UnsetPodGroup()
sc.deleteJob(job)
return nil
}
// AddPodGroupV1beta1 add podgroup to scheduler cache
func (sc *SchedulerCache) AddPodGroupV1beta1(obj interface{}) {
ss, ok := obj.(*schedulingv1beta1.PodGroup)
if !ok {
klog.Errorf("Cannot convert to *schedulingv1beta1.PodGroup: %v", obj)
return
}
podgroup := scheduling.PodGroup{}
if err := scheme.Scheme.Convert(ss, &podgroup, nil); err != nil {
klog.Errorf("Failed to convert podgroup from %T to %T", ss, podgroup)
return
}
pg := &schedulingapi.PodGroup{PodGroup: podgroup, Version: schedulingapi.PodGroupVersionV1Beta1}
klog.V(4).Infof("Add PodGroup(%s) into cache, spec(%#v)", ss.Name, ss.Spec)
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
if err := sc.setPodGroup(pg); err != nil {
klog.Errorf("Failed to add PodGroup %s into cache: %v", ss.Name, err)
return
}
}
// UpdatePodGroupV1beta1 update podgroup in scheduler cache
func (sc *SchedulerCache) UpdatePodGroupV1beta1(oldObj, newObj interface{}) {
oldSS, ok := oldObj.(*schedulingv1beta1.PodGroup)
if !ok {
klog.Errorf("Cannot convert oldObj to *schedulingv1beta1.SchedulingSpec: %v", oldObj)
return
}
newSS, ok := newObj.(*schedulingv1beta1.PodGroup)
if !ok {
klog.Errorf("Cannot convert newObj to *schedulingv1beta1.SchedulingSpec: %v", newObj)
return
}
if oldSS.ResourceVersion == newSS.ResourceVersion {
return
}
podgroup := scheduling.PodGroup{}
if err := scheme.Scheme.Convert(newSS, &podgroup, nil); err != nil {
klog.Errorf("Failed to convert podgroup from %T to %T", newSS, podgroup)
return
}
pg := &schedulingapi.PodGroup{PodGroup: podgroup, Version: schedulingapi.PodGroupVersionV1Beta1}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
if err := sc.updatePodGroup(pg); err != nil {
klog.Errorf("Failed to update SchedulingSpec %s into cache: %v", pg.Name, err)
return
}
}
// DeletePodGroupV1beta1 delete podgroup from scheduler cache
func (sc *SchedulerCache) DeletePodGroupV1beta1(obj interface{}) {
var ss *schedulingv1beta1.PodGroup
switch t := obj.(type) {
case *schedulingv1beta1.PodGroup:
ss = t
case cache.DeletedFinalStateUnknown:
var ok bool
ss, ok = t.Obj.(*schedulingv1beta1.PodGroup)
if !ok {
klog.Errorf("Cannot convert to podgroup: %v", t.Obj)
return
}
default:
klog.Errorf("Cannot convert to podgroup: %v", t)
return
}
jobID := schedulingapi.JobID(fmt.Sprintf("%s/%s", ss.Namespace, ss.Name))
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
if err := sc.deletePodGroup(jobID); err != nil {
klog.Errorf("Failed to delete podgroup %s from cache: %v", ss.Name, err)
return
}
}
// AddQueueV1beta1 add queue to scheduler cache
func (sc *SchedulerCache) AddQueueV1beta1(obj interface{}) {
ss, ok := obj.(*schedulingv1beta1.Queue)
if !ok {
klog.Errorf("Cannot convert to *schedulingv1beta1.Queue: %v", obj)
return
}
queue := &scheduling.Queue{}
if err := scheme.Scheme.Convert(ss, queue, nil); err != nil {
klog.Errorf("Failed to convert queue from %T to %T", ss, queue)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
klog.V(4).Infof("Add Queue(%s) into cache, spec(%#v)", ss.Name, ss.Spec)
sc.addQueue(queue)
}
// UpdateQueueV1beta1 update queue to scheduler cache
func (sc *SchedulerCache) UpdateQueueV1beta1(oldObj, newObj interface{}) {
oldSS, ok := oldObj.(*schedulingv1beta1.Queue)
if !ok {
klog.Errorf("Cannot convert oldObj to *schedulingv1beta1.Queue: %v", oldObj)
return
}
newSS, ok := newObj.(*schedulingv1beta1.Queue)
if !ok {
klog.Errorf("Cannot convert newObj to *schedulingv1beta1.Queue: %v", newObj)
return
}
if oldSS.ResourceVersion == newSS.ResourceVersion {
return
}
newQueue := &scheduling.Queue{}
if err := scheme.Scheme.Convert(newSS, newQueue, nil); err != nil {
klog.Errorf("Failed to convert queue from %T to %T", newSS, newQueue)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
sc.updateQueue(newQueue)
}
// DeleteQueueV1beta1 delete queue from the scheduler cache
func (sc *SchedulerCache) DeleteQueueV1beta1(obj interface{}) {
var ss *schedulingv1beta1.Queue
switch t := obj.(type) {
case *schedulingv1beta1.Queue:
ss = t
case cache.DeletedFinalStateUnknown:
var ok bool
ss, ok = t.Obj.(*schedulingv1beta1.Queue)
if !ok {
klog.Errorf("Cannot convert to *schedulingv1beta1.Queue: %v", t.Obj)
return
}
default:
klog.Errorf("Cannot convert to *schedulingv1beta1.Queue: %v", t)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
sc.deleteQueue(schedulingapi.QueueID(ss.Name))
}
func (sc *SchedulerCache) addQueue(queue *scheduling.Queue) {
qi := schedulingapi.NewQueueInfo(queue)
sc.Queues[qi.UID] = qi
}
func (sc *SchedulerCache) updateQueue(queue *scheduling.Queue) {
sc.addQueue(queue)
}
func (sc *SchedulerCache) deleteQueue(id schedulingapi.QueueID) {
delete(sc.Queues, id)
}
// DeletePriorityClass delete priorityclass from the scheduler cache
func (sc *SchedulerCache) DeletePriorityClass(obj interface{}) {
var ss *schedulingv1.PriorityClass
switch t := obj.(type) {
case *schedulingv1.PriorityClass:
ss = t
case cache.DeletedFinalStateUnknown:
var ok bool
ss, ok = t.Obj.(*schedulingv1.PriorityClass)
if !ok {
klog.Errorf("Cannot convert to *schedulingv1.PriorityClass: %v", t.Obj)
return
}
default:
klog.Errorf("Cannot convert to *schedulingv1.PriorityClass: %v", t)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
sc.deletePriorityClass(ss)
}
// UpdatePriorityClass update priorityclass to scheduler cache
func (sc *SchedulerCache) UpdatePriorityClass(oldObj, newObj interface{}) {
oldSS, ok := oldObj.(*schedulingv1.PriorityClass)
if !ok {
klog.Errorf("Cannot convert oldObj to *schedulingv1.PriorityClass: %v", oldObj)
return
}
newSS, ok := newObj.(*schedulingv1.PriorityClass)
if !ok {
klog.Errorf("Cannot convert newObj to *schedulingv1.PriorityClass: %v", newObj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
sc.deletePriorityClass(oldSS)
sc.addPriorityClass(newSS)
}
// AddPriorityClass add priorityclass to scheduler cache
func (sc *SchedulerCache) AddPriorityClass(obj interface{}) {
ss, ok := obj.(*schedulingv1.PriorityClass)
if !ok {
klog.Errorf("Cannot convert to *schedulingv1.PriorityClass: %v", obj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
sc.addPriorityClass(ss)
}
func (sc *SchedulerCache) deletePriorityClass(pc *schedulingv1.PriorityClass) {
if pc.GlobalDefault {
sc.defaultPriorityClass = nil
sc.defaultPriority = 0
}
delete(sc.PriorityClasses, pc.Name)
}
func (sc *SchedulerCache) addPriorityClass(pc *schedulingv1.PriorityClass) {
if pc.GlobalDefault {
if sc.defaultPriorityClass != nil {
klog.Errorf("Updated default priority class from <%s> to <%s> forcefully.",
sc.defaultPriorityClass.Name, pc.Name)
}
sc.defaultPriorityClass = pc
sc.defaultPriority = pc.Value
}
sc.PriorityClasses[pc.Name] = pc
}
func (sc *SchedulerCache) updateResourceQuota(quota *v1.ResourceQuota) {
collection, ok := sc.NamespaceCollection[quota.Namespace]
if !ok {
collection = schedulingapi.NewNamespaceCollection(quota.Namespace)
sc.NamespaceCollection[quota.Namespace] = collection
}
collection.Update(quota)
}
func (sc *SchedulerCache) deleteResourceQuota(quota *v1.ResourceQuota) {
collection, ok := sc.NamespaceCollection[quota.Namespace]
if !ok {
return
}
collection.Delete(quota)
}
// DeleteResourceQuota delete ResourceQuota from the scheduler cache
func (sc *SchedulerCache) DeleteResourceQuota(obj interface{}) {
var r *v1.ResourceQuota
switch t := obj.(type) {
case *v1.ResourceQuota:
r = t
case cache.DeletedFinalStateUnknown:
var ok bool
r, ok = t.Obj.(*v1.ResourceQuota)
if !ok {
klog.Errorf("Cannot convert to *v1.ResourceQuota: %v", t.Obj)
return
}
default:
klog.Errorf("Cannot convert to *v1.ResourceQuota: %v", t)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
klog.V(3).Infof("Delete ResourceQuota <%s/%v> in cache", r.Namespace, r.Name)
sc.deleteResourceQuota(r)
}
// UpdateResourceQuota update ResourceQuota to scheduler cache
func (sc *SchedulerCache) UpdateResourceQuota(oldObj, newObj interface{}) {
newR, ok := newObj.(*v1.ResourceQuota)
if !ok {
klog.Errorf("Cannot convert newObj to *v1.ResourceQuota: %v", newObj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
klog.V(3).Infof("Update ResourceQuota <%s/%v> in cache, with spec: %v.", newR.Namespace, newR.Name, newR.Spec.Hard)
sc.updateResourceQuota(newR)
}
// AddResourceQuota add ResourceQuota to scheduler cache
func (sc *SchedulerCache) AddResourceQuota(obj interface{}) {
var r *v1.ResourceQuota
switch t := obj.(type) {
case *v1.ResourceQuota:
r = t
default:
klog.Errorf("Cannot convert to *v1.ResourceQuota: %v", t)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
klog.V(3).Infof("Add ResourceQuota <%s/%v> in cache, with spec: %v.", r.Namespace, r.Name, r.Spec.Hard)
sc.updateResourceQuota(r)
}
func getNumaInfo(srcInfo *nodeinfov1alpha1.Numatopology) *schedulingapi.NumatopoInfo {
numaInfo := &schedulingapi.NumatopoInfo{
Namespace: srcInfo.Namespace,
Name: srcInfo.Name,
Policies: make(map[nodeinfov1alpha1.PolicyName]string),
NumaResMap: make(map[string]*schedulingapi.ResourceInfo),
CPUDetail: topology.CPUDetails{},
ResReserved: make(v1.ResourceList),
}
policies := srcInfo.Spec.Policies
for name, policy := range policies {
numaInfo.Policies[name] = policy
}
numaResMap := srcInfo.Spec.NumaResMap
for name, resInfo := range numaResMap {
tmp := schedulingapi.ResourceInfo{}
tmp.Capacity = resInfo.Capacity
tmp.Allocatable = cpuset.MustParse(resInfo.Allocatable)
numaInfo.NumaResMap[name] = &tmp
}
cpuDetail := srcInfo.Spec.CPUDetail
for key, detail := range cpuDetail {
cpuID, _ := strconv.Atoi(key)
numaInfo.CPUDetail[cpuID] = topology.CPUInfo{
NUMANodeID: detail.NUMANodeID,
SocketID: detail.SocketID,
CoreID: detail.CoreID,
}
}
resReserved, err := schedulingapi.ParseResourceList(srcInfo.Spec.ResReserved)
if err != nil {
klog.Errorf("ParseResourceList failed, err=%v", err)
} else {
numaInfo.ResReserved = resReserved
}
return numaInfo
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) addNumaInfo(info *nodeinfov1alpha1.Numatopology) error {
if sc.Nodes[info.Name] == nil {
sc.Nodes[info.Name] = schedulingapi.NewNodeInfo(nil)
sc.Nodes[info.Name].Name = info.Name
}
if sc.Nodes[info.Name].NumaInfo == nil {
sc.Nodes[info.Name].NumaInfo = getNumaInfo(info)
sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoMoreFlag
} else {
newLocalInfo := getNumaInfo(info)
if sc.Nodes[info.Name].NumaInfo.Compare(newLocalInfo) {
sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoMoreFlag
} else {
sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoLessFlag
}
sc.Nodes[info.Name].NumaInfo = newLocalInfo
}
for resName, NumaResInfo := range sc.Nodes[info.Name].NumaInfo.NumaResMap {
klog.V(3).Infof("resource %s Allocatable %v on node[%s] into cache", resName, NumaResInfo, info.Name)
}
klog.V(3).Infof("Policies %v on node[%s] into cache, change= %v",
sc.Nodes[info.Name].NumaInfo.Policies, info.Name, sc.Nodes[info.Name].NumaChgFlag)
return nil
}
// Assumes that lock is already acquired.
func (sc *SchedulerCache) deleteNumaInfo(info *nodeinfov1alpha1.Numatopology) {
if sc.Nodes[info.Name] != nil {
sc.Nodes[info.Name].NumaInfo = nil
sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoResetFlag
klog.V(3).Infof("delete numainfo in cahce for node<%s>", info.Name)
}
}
// AddNumaInfoV1alpha1 add numa information to scheduler cache
func (sc *SchedulerCache) AddNumaInfoV1alpha1(obj interface{}) {
ss, ok := obj.(*nodeinfov1alpha1.Numatopology)
if !ok {
klog.Errorf("Cannot convert oldObj to *nodeinfov1alpha1.Numatopology: %v", obj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
sc.addNumaInfo(ss)
}
// UpdateNumaInfoV1alpha1 update numa information to scheduler cache
func (sc *SchedulerCache) UpdateNumaInfoV1alpha1(oldObj, newObj interface{}) {
ss, ok := newObj.(*nodeinfov1alpha1.Numatopology)
if !ok {
klog.Errorf("Cannot convert oldObj to *nodeinfov1alpha1.Numatopology: %v", newObj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
sc.addNumaInfo(ss)
klog.V(3).Infof("update numaInfo<%s> in cahce, with spec: Policy: %v, resMap: %v", ss.Name, ss.Spec.Policies, ss.Spec.NumaResMap)
}
// DeleteNumaInfoV1alpha1 delete numa information from scheduler cache
func (sc *SchedulerCache) DeleteNumaInfoV1alpha1(obj interface{}) {
var ss *nodeinfov1alpha1.Numatopology
switch t := obj.(type) {
case *nodeinfov1alpha1.Numatopology:
ss = t
case cache.DeletedFinalStateUnknown:
var ok bool
ss, ok = t.Obj.(*nodeinfov1alpha1.Numatopology)
if !ok {
klog.Errorf("Cannot convert to Numatopo: %v", t.Obj)
return
}
default:
klog.Errorf("Cannot convert to Numatopo: %v", t)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
sc.deleteNumaInfo(ss)
klog.V(3).Infof("Delete numaInfo<%s> from cahce, with spec: Policy: %v, resMap: %v", ss.Name, ss.Spec.Policies, ss.Spec.NumaResMap)
}
// AddJob add job to scheduler cache
func (sc *SchedulerCache) AddJob(obj interface{}) {
job, ok := obj.(*schedulingapi.JobInfo)
if !ok {
klog.Errorf("Cannot convert to *api.JobInfo: %v", obj)
return
}
sc.Mutex.Lock()
defer sc.Mutex.Unlock()
sc.Jobs[job.UID] = job
}
package cache
// bindMethodMap Binder management
var bindMethodMap Binder
// RegisterBindMethod register Bind Method
func RegisterBindMethod(binder Binder) {
bindMethodMap = binder
}
// GetBindMethod returns the registered bind method.
func GetBindMethod() Binder {
return bindMethodMap
}
func init() {
RegisterBindMethod(NewBinder())
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"fmt"
"os"
"strconv"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
"stathat.com/c/consistent"
scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
// responsibleForPod returns false under the following conditions:
// 1. The current scheduler is not the scheduler specified in the Pod's spec.
// 2. The Job to which the Pod belongs is not assigned to the current scheduler by the hash algorithm in the multi-scheduler scenario.
func responsibleForPod(pod *v1.Pod, schedulerName string, mySchedulerPodName string, c *consistent.Consistent) bool {
if schedulerName != pod.Spec.SchedulerName {
return false
}
if c != nil {
var key string
if len(pod.OwnerReferences) != 0 {
key = pod.OwnerReferences[0].Name
} else {
key = pod.Name
}
schedulerPodName, err := c.Get(key)
if err != nil {
klog.Errorf("Failed to get scheduler by hash algorithm, err: %v", err)
}
if schedulerPodName != mySchedulerPodName {
return false
}
}
klog.V(4).Infof("schedulerPodName %v is responsible to Pod %v/%v", mySchedulerPodName, pod.Namespace, pod.Name)
return true
}
// responsibleForNode returns true if the Node is assigned to the current scheduler in the multi-scheduler scenario
func responsibleForNode(nodeName string, mySchedulerPodName string, c *consistent.Consistent) bool {
if c != nil {
schedulerPodName, err := c.Get(nodeName)
if err != nil {
klog.Errorf("Failed to get scheduler by hash algorithm, err: %v", err)
}
if schedulerPodName != mySchedulerPodName {
return false
}
}
klog.V(4).Infof("schedulerPodName %v is responsible to Node %v", mySchedulerPodName, nodeName)
return true
}
// responsibleForPodGroup returns true if the Job to which the PodGroup belongs is assigned to the current scheduler in the multi-scheduler scenario
func responsibleForPodGroup(pg *scheduling.PodGroup, mySchedulerPodName string, c *consistent.Consistent) bool {
if c != nil {
var key string
if len(pg.OwnerReferences) != 0 {
key = pg.OwnerReferences[0].Name
} else {
key = pg.Name
}
schedulerPodName, err := c.Get(key)
if err != nil {
klog.Errorf("Failed to get scheduler by hash algorithm, err: %v", err)
}
if schedulerPodName != mySchedulerPodName {
return false
}
}
klog.V(4).Infof("schedulerPodName %v is responsible to PodGroup %v/%v", mySchedulerPodName, pg.Namespace, pg.Name)
return true
}
// getMultiSchedulerInfo returns the Pod name of the current scheduler and the consistent hash ring covering all schedulers
func getMultiSchedulerInfo() (schedulerPodName string, c *consistent.Consistent) {
multiSchedulerEnable := os.Getenv("MULTI_SCHEDULER_ENABLE")
mySchedulerPodName := os.Getenv("SCHEDULER_POD_NAME")
c = nil
if multiSchedulerEnable == "true" {
klog.V(3).Infof("multiSchedulerEnable true")
schedulerNumStr := os.Getenv("SCHEDULER_NUM")
schedulerNum, err := strconv.Atoi(schedulerNumStr)
if err != nil {
schedulerNum = 1
}
index := strings.LastIndex(mySchedulerPodName, "-")
baseName := mySchedulerPodName[0:index]
c = consistent.New()
for i := 0; i < schedulerNum; i++ {
name := fmt.Sprintf("%s-%d", baseName, i)
c.Add(name)
}
}
return mySchedulerPodName, c
}
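// Illustrative sketch (not part of the scheduler): shows how the consistent hash
// ring built by getMultiSchedulerInfo assigns ownership to scheduler replicas.
// The replica names ("volcano-scheduler-0/1/2") and the owner key "example-job"
// are hypothetical values, not data read from a real cluster.
func exampleMultiSchedulerAssignment() {
ring := consistent.New()
for i := 0; i < 3; i++ {
ring.Add(fmt.Sprintf("volcano-scheduler-%d", i))
}
// Pods are keyed by their first OwnerReference name (falling back to the Pod
// name), so all Pods of one Job hash to the same scheduler replica.
owner, err := ring.Get("example-job")
if err != nil {
klog.Errorf("Failed to get scheduler by hash algorithm, err: %v", err)
return
}
klog.V(4).Infof("Job example-job would be handled by scheduler %s", owner)
}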
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"strconv"
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/conf"
)
// Arguments map
type Arguments map[string]string
// GetInt get the integer value from string
func (a Arguments) GetInt(ptr *int, key string) {
if ptr == nil {
return
}
argv, ok := a[key]
if !ok || argv == "" {
return
}
value, err := strconv.Atoi(argv)
if err != nil {
klog.Warningf("Could not parse argument: %s for key %s, with err %v", argv, key, err)
return
}
*ptr = value
}
// GetFloat64 get the float64 value from string
func (a Arguments) GetFloat64(ptr *float64, key string) {
if ptr == nil {
return
}
argv, ok := a[key]
if !ok || len(argv) == 0 {
return
}
value, err := strconv.ParseFloat(argv, 64)
if err != nil {
klog.Warningf("Could not parse argument: %s for key %s, with err %v", argv, key, err)
return
}
*ptr = value
}
// GetBool get the bool value from string
func (a Arguments) GetBool(ptr *bool, key string) {
if ptr == nil {
return
}
argv, ok := a[key]
if !ok || argv == "" {
return
}
value, err := strconv.ParseBool(argv)
if err != nil {
klog.Warningf("Could not parse argument: %s for key %s, with err %v", argv, key, err)
return
}
*ptr = value
}
// GetArgOfActionFromConf returns the arguments of the specified action from the scheduler configuration
func GetArgOfActionFromConf(configurations []conf.Configuration, actionName string) Arguments {
for _, c := range configurations {
if c.Name == actionName {
return c.Arguments
}
}
return nil
}
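// Illustrative sketch: how a plugin or action reads typed values out of its
// Arguments map. The keys and default values below are hypothetical.
func exampleArguments() {
args := Arguments{
"weight": "5",
"enable": "true",
"factor": "0.75",
}
// Defaults are kept when a key is absent or its value cannot be parsed.
weight, factor, enable := 1, 1.0, false
args.GetInt(&weight, "weight")
args.GetFloat64(&factor, "factor")
args.GetBool(&enable, "enable")
klog.V(4).Infof("weight=%d factor=%v enable=%v", weight, factor, enable)
}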
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"time"
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/cache"
"volcano.sh/volcano/pkg/scheduler/conf"
"volcano.sh/volcano/pkg/scheduler/metrics"
)
// OpenSession start the session
func OpenSession(cache cache.Cache, tiers []conf.Tier, configurations []conf.Configuration) *Session {
ssn := openSession(cache)
ssn.Tiers = tiers
ssn.Configurations = configurations
for _, tier := range tiers {
for _, plugin := range tier.Plugins {
if pb, found := GetPluginBuilder(plugin.Name); !found {
klog.Errorf("Failed to get plugin %s.", plugin.Name)
} else {
plugin := pb(plugin.Arguments)
ssn.plugins[plugin.Name()] = plugin
onSessionOpenStart := time.Now()
plugin.OnSessionOpen(ssn)
metrics.UpdatePluginDuration(plugin.Name(), metrics.OnSessionOpen, metrics.Duration(onSessionOpenStart))
}
}
}
return ssn
}
// CloseSession close the session
func CloseSession(ssn *Session) {
for _, plugin := range ssn.plugins {
onSessionCloseStart := time.Now()
plugin.OnSessionClose(ssn)
metrics.UpdatePluginDuration(plugin.Name(), metrics.OnSessionClose, metrics.Duration(onSessionCloseStart))
}
closeSession(ssn)
}
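// Illustrative sketch: the per-cycle lifecycle around a Session as driven by the
// scheduler loop. The cache, tiers and configurations are assumed to come from
// the scheduler's own setup; configured actions would run between open and close.
func exampleScheduleCycle(schedCache cache.Cache, tiers []conf.Tier, configurations []conf.Configuration) {
ssn := OpenSession(schedCache, tiers, configurations)
defer CloseSession(ssn)
// Actions configured for this cycle (enqueue, allocate, preempt, ...) execute here against ssn.
klog.V(4).Infof("Running actions in Session %v", ssn.UID)
}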
package framework
import (
"context"
"math/rand"
"reflect"
"time"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/scheduling"
"volcano.sh/volcano/pkg/scheduler/api"
)
const (
jobUpdaterWorker = 16
jobConditionUpdateTime = time.Minute
jobConditionUpdateTimeJitter = 30 * time.Second
)
// TimeJitterAfter reports whether new is after old + duration + jitter
func TimeJitterAfter(new, old time.Time, duration, maxJitter time.Duration) bool {
var jitter int64
if maxJitter > 0 {
jitter = rand.Int63n(int64(maxJitter))
}
return new.After(old.Add(duration + time.Duration(jitter)))
}
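// Illustrative sketch: with the constants above, a condition transition only
// counts as "new" once it is at least jobConditionUpdateTime newer than the
// previous one, plus a random jitter of up to jobConditionUpdateTimeJitter that
// spreads out status updates. The timestamps below are made up for illustration.
func exampleTimeJitterAfter() bool {
oldTime := time.Now().Add(-2 * time.Minute)
newTime := time.Now()
// With a 2 minute gap this is always true, since 1m + jitter(<30s) < 2m.
return TimeJitterAfter(newTime, oldTime, jobConditionUpdateTime, jobConditionUpdateTimeJitter)
}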
type jobUpdater struct {
ssn *Session
jobQueue []*api.JobInfo
}
func newJobUpdater(ssn *Session) *jobUpdater {
queue := make([]*api.JobInfo, 0, len(ssn.Jobs))
for _, job := range ssn.Jobs {
queue = append(queue, job)
}
ju := &jobUpdater{
ssn: ssn,
jobQueue: queue,
}
return ju
}
func (ju *jobUpdater) UpdateAll() {
workqueue.ParallelizeUntil(context.TODO(), jobUpdaterWorker, len(ju.jobQueue), ju.updateJob)
}
func isPodGroupConditionsUpdated(newCondition, oldCondition []scheduling.PodGroupCondition) bool {
if len(newCondition) != len(oldCondition) {
return true
}
for index, newCond := range newCondition {
oldCond := oldCondition[index]
newTime := newCond.LastTransitionTime
oldTime := oldCond.LastTransitionTime
if TimeJitterAfter(newTime.Time, oldTime.Time, jobConditionUpdateTime, jobConditionUpdateTimeJitter) {
return true
}
// if newCond is not new enough, we treat it the same as the old one
newCond.LastTransitionTime = oldTime
// comparing should ignore the TransitionID
newTransitionID := newCond.TransitionID
newCond.TransitionID = oldCond.TransitionID
shouldUpdate := !reflect.DeepEqual(&newCond, &oldCond)
newCond.LastTransitionTime = newTime
newCond.TransitionID = newTransitionID
if shouldUpdate {
return true
}
}
return false
}
func isPodGroupStatusUpdated(newStatus, oldStatus scheduling.PodGroupStatus) bool {
newCondition := newStatus.Conditions
newStatus.Conditions = nil
oldCondition := oldStatus.Conditions
oldStatus.Conditions = nil
return !reflect.DeepEqual(newStatus, oldStatus) || isPodGroupConditionsUpdated(newCondition, oldCondition)
}
// updateJob update specified job
func (ju *jobUpdater) updateJob(index int) {
job := ju.jobQueue[index]
ssn := ju.ssn
job.PodGroup.Status = jobStatus(ssn, job)
oldStatus, found := ssn.podGroupStatus[job.UID]
updatePG := !found || isPodGroupStatusUpdated(job.PodGroup.Status, oldStatus)
if _, err := ssn.cache.UpdateJobStatus(job, updatePG); err != nil {
klog.Errorf("Failed to update job <%s/%s>: %v",
job.Namespace, job.Name, err)
}
}
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"fmt"
"path/filepath"
"plugin"
"strings"
"sync"
"k8s.io/klog"
)
var pluginMutex sync.Mutex
// PluginBuilder builds a Plugin from its Arguments
type PluginBuilder = func(Arguments) Plugin
// Plugin management
var pluginBuilders = map[string]PluginBuilder{}
// RegisterPluginBuilder register the plugin
func RegisterPluginBuilder(name string, pc PluginBuilder) {
pluginMutex.Lock()
defer pluginMutex.Unlock()
pluginBuilders[name] = pc
}
// CleanupPluginBuilders cleans up all the plugin builders
func CleanupPluginBuilders() {
pluginMutex.Lock()
defer pluginMutex.Unlock()
pluginBuilders = map[string]PluginBuilder{}
}
// GetPluginBuilder get the pluginbuilder by name
func GetPluginBuilder(name string) (PluginBuilder, bool) {
pluginMutex.Lock()
defer pluginMutex.Unlock()
pb, found := pluginBuilders[name]
return pb, found
}
// LoadCustomPlugins loads custom plugin implementations (.so files) from pluginsDir
func LoadCustomPlugins(pluginsDir string) error {
pluginPaths, _ := filepath.Glob(fmt.Sprintf("%s/*.so", pluginsDir))
for _, pluginPath := range pluginPaths {
pluginBuilder, err := loadPluginBuilder(pluginPath)
if err != nil {
return err
}
pluginName := getPluginName(pluginPath)
RegisterPluginBuilder(pluginName, pluginBuilder)
klog.V(4).Infof("Custom plugin %s loaded", pluginName)
}
return nil
}
func getPluginName(pluginPath string) string {
return strings.TrimSuffix(filepath.Base(pluginPath), filepath.Ext(pluginPath))
}
func loadPluginBuilder(pluginPath string) (PluginBuilder, error) {
plug, err := plugin.Open(pluginPath)
if err != nil {
return nil, err
}
symBuilder, err := plug.Lookup("New")
if err != nil {
return nil, err
}
builder, ok := symBuilder.(PluginBuilder)
if !ok {
return nil, fmt.Errorf("unexpected plugin: %s, failed to convert PluginBuilder `New`", pluginPath)
}
return builder, nil
}
// Action management
var actionMap = map[string]Action{}
// RegisterAction register action
func RegisterAction(act Action) {
pluginMutex.Lock()
defer pluginMutex.Unlock()
actionMap[act.Name()] = act
}
// GetAction get the action by name
func GetAction(name string) (Action, bool) {
pluginMutex.Lock()
defer pluginMutex.Unlock()
act, found := actionMap[name]
return act, found
}
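// Illustrative sketch of registering an in-tree plugin builder, assuming the
// Plugin interface is exactly the three methods used by OpenSession and
// CloseSession (Name, OnSessionOpen, OnSessionClose). The plugin name "noop"
// is hypothetical; real plugins register themselves from the plugins factory.
type noopPlugin struct {
arguments Arguments
}

func (p *noopPlugin) Name() string { return "noop" }
func (p *noopPlugin) OnSessionOpen(ssn *Session) {}
func (p *noopPlugin) OnSessionClose(ssn *Session) {}

func exampleRegisterNoopPlugin() {
RegisterPluginBuilder("noop", func(args Arguments) Plugin {
return &noopPlugin{arguments: args}
})
}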
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"fmt"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/klog"
volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
"volcano.sh/apis/pkg/apis/scheduling"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/cache"
"volcano.sh/volcano/pkg/scheduler/conf"
"volcano.sh/volcano/pkg/scheduler/metrics"
"volcano.sh/volcano/pkg/scheduler/util"
)
// Session information for the current session
type Session struct {
UID types.UID
kubeClient kubernetes.Interface
cache cache.Cache
informerFactory informers.SharedInformerFactory
TotalResource *api.Resource
// podGroupStatus caches the podgroup status during the schedule cycle.
// It should not be mutated after initialization.
podGroupStatus map[api.JobID]scheduling.PodGroupStatus
Jobs map[api.JobID]*api.JobInfo
Nodes map[string]*api.NodeInfo
RevocableNodes map[string]*api.NodeInfo
Queues map[api.QueueID]*api.QueueInfo
NamespaceInfo map[api.NamespaceName]*api.NamespaceInfo
Tiers []conf.Tier
Configurations []conf.Configuration
NodeList []*api.NodeInfo
plugins map[string]Plugin
eventHandlers []*EventHandler
jobOrderFns map[string]api.CompareFn
queueOrderFns map[string]api.CompareFn
taskOrderFns map[string]api.CompareFn
namespaceOrderFns map[string]api.CompareFn
clusterOrderFns map[string]api.CompareFn
predicateFns map[string]api.PredicateFn
bestNodeFns map[string]api.BestNodeFn
nodeOrderFns map[string]api.NodeOrderFn
batchNodeOrderFns map[string]api.BatchNodeOrderFn
nodeMapFns map[string]api.NodeMapFn
nodeReduceFns map[string]api.NodeReduceFn
preemptableFns map[string]api.EvictableFn
reclaimableFns map[string]api.EvictableFn
overusedFns map[string]api.ValidateFn
underUsedFns map[string]api.UnderUsedResourceFn
jobReadyFns map[string]api.ValidateFn
jobPipelinedFns map[string]api.VoteFn
jobValidFns map[string]api.ValidateExFn
jobEnqueueableFns map[string]api.VoteFn
jobEnqueuedFns map[string]api.JobEnqueuedFn
targetJobFns map[string]api.TargetJobFn
reservedNodesFns map[string]api.ReservedNodesFn
victimTasksFns map[string]api.VictimTasksFn
jobStarvingFns map[string]api.ValidateFn
}
func openSession(cache cache.Cache) *Session {
ssn := &Session{
UID: uuid.NewUUID(),
kubeClient: cache.Client(),
cache: cache,
informerFactory: cache.SharedInformerFactory(),
TotalResource: api.EmptyResource(),
podGroupStatus: map[api.JobID]scheduling.PodGroupStatus{},
Jobs: map[api.JobID]*api.JobInfo{},
Nodes: map[string]*api.NodeInfo{},
RevocableNodes: map[string]*api.NodeInfo{},
Queues: map[api.QueueID]*api.QueueInfo{},
plugins: map[string]Plugin{},
jobOrderFns: map[string]api.CompareFn{},
queueOrderFns: map[string]api.CompareFn{},
taskOrderFns: map[string]api.CompareFn{},
namespaceOrderFns: map[string]api.CompareFn{},
clusterOrderFns: map[string]api.CompareFn{},
predicateFns: map[string]api.PredicateFn{},
bestNodeFns: map[string]api.BestNodeFn{},
nodeOrderFns: map[string]api.NodeOrderFn{},
batchNodeOrderFns: map[string]api.BatchNodeOrderFn{},
nodeMapFns: map[string]api.NodeMapFn{},
nodeReduceFns: map[string]api.NodeReduceFn{},
preemptableFns: map[string]api.EvictableFn{},
reclaimableFns: map[string]api.EvictableFn{},
overusedFns: map[string]api.ValidateFn{},
underUsedFns: map[string]api.UnderUsedResourceFn{},
jobReadyFns: map[string]api.ValidateFn{},
jobPipelinedFns: map[string]api.VoteFn{},
jobValidFns: map[string]api.ValidateExFn{},
jobEnqueueableFns: map[string]api.VoteFn{},
jobEnqueuedFns: map[string]api.JobEnqueuedFn{},
targetJobFns: map[string]api.TargetJobFn{},
reservedNodesFns: map[string]api.ReservedNodesFn{},
victimTasksFns: map[string]api.VictimTasksFn{},
jobStarvingFns: map[string]api.ValidateFn{},
}
snapshot := cache.Snapshot()
ssn.Jobs = snapshot.Jobs
for _, job := range ssn.Jobs {
// only conditions will be updated periodically
if job.PodGroup != nil && job.PodGroup.Status.Conditions != nil {
ssn.podGroupStatus[job.UID] = *job.PodGroup.Status.DeepCopy()
}
if vjr := ssn.JobValid(job); vjr != nil {
if !vjr.Pass {
jc := &scheduling.PodGroupCondition{
Type: scheduling.PodGroupUnschedulableType,
Status: v1.ConditionTrue,
LastTransitionTime: metav1.Now(),
TransitionID: string(ssn.UID),
Reason: vjr.Reason,
Message: vjr.Message,
}
if err := ssn.UpdatePodGroupCondition(job, jc); err != nil {
klog.Errorf("Failed to update job condition: %v", err)
}
}
delete(ssn.Jobs, job.UID)
}
}
ssn.NodeList = util.GetNodeList(snapshot.Nodes, snapshot.NodeList)
ssn.Nodes = snapshot.Nodes
ssn.RevocableNodes = snapshot.RevocableNodes
ssn.Queues = snapshot.Queues
ssn.NamespaceInfo = snapshot.NamespaceInfo
// Calculate the total resource of all nodes only once in each schedule cycle; plugins can clone it when needed.
for _, n := range ssn.Nodes {
ssn.TotalResource.Add(n.Allocatable)
}
klog.V(3).Infof("Open Session %v with <%d> Job and <%d> Queues",
ssn.UID, len(ssn.Jobs), len(ssn.Queues))
return ssn
}
func closeSession(ssn *Session) {
ju := newJobUpdater(ssn)
ju.UpdateAll()
ssn.Jobs = nil
ssn.Nodes = nil
ssn.RevocableNodes = nil
ssn.plugins = nil
ssn.eventHandlers = nil
ssn.jobOrderFns = nil
ssn.namespaceOrderFns = nil
ssn.queueOrderFns = nil
ssn.clusterOrderFns = nil
ssn.NodeList = nil
ssn.TotalResource = nil
klog.V(3).Infof("Close Session %v", ssn.UID)
}
func jobStatus(ssn *Session, jobInfo *api.JobInfo) scheduling.PodGroupStatus {
status := jobInfo.PodGroup.Status
unschedulable := false
for _, c := range status.Conditions {
if c.Type == scheduling.PodGroupUnschedulableType &&
c.Status == v1.ConditionTrue &&
c.TransitionID == string(ssn.UID) {
unschedulable = true
break
}
}
// If there are running tasks and the job is unschedulable, the phase is Unknown.
if len(jobInfo.TaskStatusIndex[api.Running]) != 0 && unschedulable {
status.Phase = scheduling.PodGroupUnknown
} else {
allocated := 0
for status, tasks := range jobInfo.TaskStatusIndex {
if api.AllocatedStatus(status) || status == api.Succeeded {
allocated += len(tasks)
}
}
// If enough tasks have been allocated (or succeeded), the podgroup is running.
if int32(allocated) >= jobInfo.PodGroup.Spec.MinMember {
status.Phase = scheduling.PodGroupRunning
} else if jobInfo.PodGroup.Status.Phase != scheduling.PodGroupInqueue {
status.Phase = scheduling.PodGroupPending
}
}
status.Running = int32(len(jobInfo.TaskStatusIndex[api.Running]))
status.Failed = int32(len(jobInfo.TaskStatusIndex[api.Failed]))
status.Succeeded = int32(len(jobInfo.TaskStatusIndex[api.Succeeded]))
return status
}
// Statement returns new statement object
func (ssn *Session) Statement() *Statement {
return &Statement{
ssn: ssn,
}
}
// Pipeline the task to the node in the session
func (ssn *Session) Pipeline(task *api.TaskInfo, hostname string) error {
// Only update status in session
job, found := ssn.Jobs[task.Job]
if found {
if err := job.UpdateTaskStatus(task, api.Pipelined); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
task.Namespace, task.Name, api.Pipelined, ssn.UID, err)
return err
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
task.Job, ssn.UID)
return fmt.Errorf("failed to find job %s when binding", task.Job)
}
task.NodeName = hostname
if node, found := ssn.Nodes[hostname]; found {
if err := node.AddTask(task); err != nil {
klog.Errorf("Failed to add task <%v/%v> to node <%v> in Session <%v>: %v",
task.Namespace, task.Name, hostname, ssn.UID, err)
return err
}
klog.V(3).Infof("After added Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
} else {
klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
hostname, ssn.UID)
return fmt.Errorf("failed to find node %s", hostname)
}
for _, eh := range ssn.eventHandlers {
if eh.AllocateFunc != nil {
eh.AllocateFunc(&Event{
Task: task,
})
}
}
return nil
}
// Allocate the task to the node in the session
func (ssn *Session) Allocate(task *api.TaskInfo, nodeInfo *api.NodeInfo) error {
podVolumes, err := ssn.cache.GetPodVolumes(task, nodeInfo.Node)
if err != nil {
return err
}
hostname := nodeInfo.Name
if err := ssn.cache.AllocateVolumes(task, hostname, podVolumes); err != nil {
return err
}
task.Pod.Spec.NodeName = hostname
task.PodVolumes = podVolumes
// Only update status in session
job, found := ssn.Jobs[task.Job]
if found {
if err := job.UpdateTaskStatus(task, api.Allocated); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
task.Namespace, task.Name, api.Allocated, ssn.UID, err)
return err
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
task.Job, ssn.UID)
return fmt.Errorf("failed to find job %s", task.Job)
}
task.NodeName = hostname
if node, found := ssn.Nodes[hostname]; found {
if err := node.AddTask(task); err != nil {
klog.Errorf("Failed to add task <%v/%v> to node <%v> in Session <%v>: %v",
task.Namespace, task.Name, hostname, ssn.UID, err)
return err
}
klog.V(3).Infof("After allocated Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
} else {
klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
hostname, ssn.UID)
return fmt.Errorf("failed to find node %s", hostname)
}
// Callbacks
for _, eh := range ssn.eventHandlers {
if eh.AllocateFunc != nil {
eh.AllocateFunc(&Event{
Task: task,
})
}
}
if ssn.JobReady(job) {
for _, task := range job.TaskStatusIndex[api.Allocated] {
if err := ssn.dispatch(task, podVolumes); err != nil {
klog.Errorf("Failed to dispatch task <%v/%v>: %v",
task.Namespace, task.Name, err)
return err
}
}
}
return nil
}
func (ssn *Session) dispatch(task *api.TaskInfo, volumes *volumescheduling.PodVolumes) error {
if err := ssn.cache.AddBindTask(task); err != nil {
return err
}
// Update status in session
if job, found := ssn.Jobs[task.Job]; found {
if err := job.UpdateTaskStatus(task, api.Binding); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
task.Namespace, task.Name, api.Binding, ssn.UID, err)
return err
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
task.Job, ssn.UID)
return fmt.Errorf("failed to find job %s", task.Job)
}
metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
return nil
}
// Evict the task in the session
func (ssn *Session) Evict(reclaimee *api.TaskInfo, reason string) error {
if err := ssn.cache.Evict(reclaimee, reason); err != nil {
return err
}
// Update status in session
job, found := ssn.Jobs[reclaimee.Job]
if found {
if err := job.UpdateTaskStatus(reclaimee, api.Releasing); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
reclaimee.Namespace, reclaimee.Name, api.Releasing, ssn.UID, err)
return err
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
reclaimee.Job, ssn.UID)
return fmt.Errorf("failed to find job %s", reclaimee.Job)
}
// Update task in node.
if node, found := ssn.Nodes[reclaimee.NodeName]; found {
if err := node.UpdateTask(reclaimee); err != nil {
klog.Errorf("Failed to update task <%v/%v> in Session <%v>: %v",
reclaimee.Namespace, reclaimee.Name, ssn.UID, err)
return err
}
}
for _, eh := range ssn.eventHandlers {
if eh.DeallocateFunc != nil {
eh.DeallocateFunc(&Event{
Task: reclaimee,
})
}
}
return nil
}
// BindPodGroup bind PodGroup to specified cluster
func (ssn *Session) BindPodGroup(job *api.JobInfo, cluster string) error {
return ssn.cache.BindPodGroup(job, cluster)
}
// UpdatePodGroupCondition update job condition accordingly.
func (ssn *Session) UpdatePodGroupCondition(jobInfo *api.JobInfo, cond *scheduling.PodGroupCondition) error {
job, ok := ssn.Jobs[jobInfo.UID]
if !ok {
return fmt.Errorf("failed to find job <%s/%s>", jobInfo.Namespace, jobInfo.Name)
}
index := -1
for i, c := range job.PodGroup.Status.Conditions {
if c.Type == cond.Type {
index = i
break
}
}
// Update condition to the new condition.
if index < 0 {
job.PodGroup.Status.Conditions = append(job.PodGroup.Status.Conditions, *cond)
} else {
job.PodGroup.Status.Conditions[index] = *cond
}
return nil
}
// AddEventHandler add event handlers
func (ssn *Session) AddEventHandler(eh *EventHandler) {
ssn.eventHandlers = append(ssn.eventHandlers, eh)
}
// UpdateSchedulerNumaInfo update SchedulerNumaInfo
func (ssn *Session) UpdateSchedulerNumaInfo(AllocatedSets map[string]api.ResNumaSets) {
ssn.cache.UpdateSchedulerNumaInfo(AllocatedSets)
}
// KubeClient returns the kubernetes client
func (ssn Session) KubeClient() kubernetes.Interface {
return ssn.kubeClient
}
// InformerFactory returns the scheduler SharedInformerFactory
func (ssn Session) InformerFactory() informers.SharedInformerFactory {
return ssn.informerFactory
}
// String returns the jobs and nodes information in the session
func (ssn Session) String() string {
msg := fmt.Sprintf("Session %v: \n", ssn.UID)
for _, job := range ssn.Jobs {
msg = fmt.Sprintf("%s%v\n", msg, job)
}
for _, node := range ssn.Nodes {
msg = fmt.Sprintf("%s%v\n", msg, node)
}
return msg
}
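// Illustrative sketch: how a plugin's OnSessionOpen could hook allocation events
// via AddEventHandler to keep its own bookkeeping in sync. The logging bodies
// are examples only.
func exampleAddEventHandler(ssn *Session) {
ssn.AddEventHandler(&EventHandler{
AllocateFunc: func(event *Event) {
klog.V(4).Infof("Task <%v/%v> allocated in Session <%v>", event.Task.Namespace, event.Task.Name, ssn.UID)
},
DeallocateFunc: func(event *Event) {
klog.V(4).Infof("Task <%v/%v> deallocated in Session <%v>", event.Task.Namespace, event.Task.Name, ssn.UID)
},
})
}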
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
"volcano.sh/apis/pkg/apis/scheduling"
"volcano.sh/volcano/pkg/controllers/job/helpers"
"volcano.sh/volcano/pkg/scheduler/api"
)
// AddJobOrderFn add job order function
func (ssn *Session) AddJobOrderFn(name string, cf api.CompareFn) {
ssn.jobOrderFns[name] = cf
}
// AddQueueOrderFn add queue order function
func (ssn *Session) AddQueueOrderFn(name string, qf api.CompareFn) {
ssn.queueOrderFns[name] = qf
}
// AddClusterOrderFn add cluster order function
func (ssn *Session) AddClusterOrderFn(name string, qf api.CompareFn) {
ssn.clusterOrderFns[name] = qf
}
// AddTaskOrderFn add task order function
func (ssn *Session) AddTaskOrderFn(name string, cf api.CompareFn) {
ssn.taskOrderFns[name] = cf
}
// AddNamespaceOrderFn add namespace order function
func (ssn *Session) AddNamespaceOrderFn(name string, cf api.CompareFn) {
ssn.namespaceOrderFns[name] = cf
}
// AddPreemptableFn add preemptable function
func (ssn *Session) AddPreemptableFn(name string, cf api.EvictableFn) {
ssn.preemptableFns[name] = cf
}
// AddReclaimableFn add Reclaimable function
func (ssn *Session) AddReclaimableFn(name string, rf api.EvictableFn) {
ssn.reclaimableFns[name] = rf
}
// AddJobReadyFn add JobReady function
func (ssn *Session) AddJobReadyFn(name string, vf api.ValidateFn) {
ssn.jobReadyFns[name] = vf
}
// AddJobPipelinedFn add pipelined function
func (ssn *Session) AddJobPipelinedFn(name string, vf api.VoteFn) {
ssn.jobPipelinedFns[name] = vf
}
// AddPredicateFn add Predicate function
func (ssn *Session) AddPredicateFn(name string, pf api.PredicateFn) {
ssn.predicateFns[name] = pf
}
// AddBestNodeFn add BestNode function
func (ssn *Session) AddBestNodeFn(name string, pf api.BestNodeFn) {
ssn.bestNodeFns[name] = pf
}
// AddNodeOrderFn add Node order function
func (ssn *Session) AddNodeOrderFn(name string, pf api.NodeOrderFn) {
ssn.nodeOrderFns[name] = pf
}
// AddBatchNodeOrderFn add Batch Node order function
func (ssn *Session) AddBatchNodeOrderFn(name string, pf api.BatchNodeOrderFn) {
ssn.batchNodeOrderFns[name] = pf
}
// AddNodeMapFn add Node map function
func (ssn *Session) AddNodeMapFn(name string, pf api.NodeMapFn) {
ssn.nodeMapFns[name] = pf
}
// AddNodeReduceFn add Node reduce function
func (ssn *Session) AddNodeReduceFn(name string, pf api.NodeReduceFn) {
ssn.nodeReduceFns[name] = pf
}
// AddOverusedFn add overused function
func (ssn *Session) AddOverusedFn(name string, fn api.ValidateFn) {
ssn.overusedFns[name] = fn
}
// AddUnderusedResourceFn add underused function
func (ssn *Session) AddUnderusedResourceFn(name string, fn api.UnderUsedResourceFn) {
ssn.underUsedFns[name] = fn
}
// AddJobValidFn add jobvalid function
func (ssn *Session) AddJobValidFn(name string, fn api.ValidateExFn) {
ssn.jobValidFns[name] = fn
}
// AddJobEnqueueableFn add jobenqueueable function
func (ssn *Session) AddJobEnqueueableFn(name string, fn api.VoteFn) {
ssn.jobEnqueueableFns[name] = fn
}
// AddJobEnqueuedFn add jobEnqueued function
func (ssn *Session) AddJobEnqueuedFn(name string, fn api.JobEnqueuedFn) {
ssn.jobEnqueuedFns[name] = fn
}
// AddTargetJobFn add targetjob function
func (ssn *Session) AddTargetJobFn(name string, fn api.TargetJobFn) {
ssn.targetJobFns[name] = fn
}
// AddReservedNodesFn add reservedNodesFn function
func (ssn *Session) AddReservedNodesFn(name string, fn api.ReservedNodesFn) {
ssn.reservedNodesFns[name] = fn
}
// AddVictimTasksFns add victimTasksFns function
func (ssn *Session) AddVictimTasksFns(name string, fn api.VictimTasksFn) {
ssn.victimTasksFns[name] = fn
}
// AddJobStarvingFns add jobStarvingFns function
func (ssn *Session) AddJobStarvingFns(name string, fn api.ValidateFn) {
ssn.jobStarvingFns[name] = fn
}
// Reclaimable invoke reclaimable function of the plugins
func (ssn *Session) Reclaimable(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) []*api.TaskInfo {
var victims []*api.TaskInfo
var init bool
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledReclaimable) {
continue
}
rf, found := ssn.reclaimableFns[plugin.Name]
if !found {
continue
}
candidates, abstain := rf(reclaimer, reclaimees)
if abstain == 0 {
continue
}
if len(candidates) == 0 {
victims = nil
break
}
if !init {
victims = candidates
init = true
} else {
var intersection []*api.TaskInfo
// Get intersection of victims and candidates.
for _, v := range victims {
for _, c := range candidates {
if v.UID == c.UID {
intersection = append(intersection, v)
}
}
}
// Update victims to intersection
victims = intersection
}
}
// Plugins in this tier made decision if victims is not nil
if victims != nil {
return victims
}
}
return victims
}
// Preemptable invoke preemptable function of the plugins
func (ssn *Session) Preemptable(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) []*api.TaskInfo {
var victims []*api.TaskInfo
var init bool
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledPreemptable) {
continue
}
pf, found := ssn.preemptableFns[plugin.Name]
if !found {
continue
}
candidates, abstain := pf(preemptor, preemptees)
if abstain == 0 {
continue
}
// intersection will be nil if length is 0, don't need to do any more check
if len(candidates) == 0 {
victims = nil
break
}
if !init {
victims = candidates
init = true
} else {
var intersection []*api.TaskInfo
// Get intersection of victims and candidates.
for _, v := range victims {
for _, c := range candidates {
if v.UID == c.UID {
intersection = append(intersection, v)
}
}
}
// Update victims to intersection
victims = intersection
}
}
// Plugins in this tier made decision if victims is not nil
if victims != nil {
return victims
}
}
return victims
}
// Overused invoke overused function of the plugins
func (ssn *Session) Overused(queue *api.QueueInfo) bool {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
of, found := ssn.overusedFns[plugin.Name]
if !found {
continue
}
if of(queue) {
return true
}
}
}
return false
}
// UnderusedResources invoke underused function of the plugins
// Returns:
// * nil if no `UnderUsedResourceFn` is registered
// * [] if no under-used resources
func (ssn *Session) UnderusedResources(queue *api.QueueInfo) api.ResourceNameList {
if len(ssn.underUsedFns) == 0 {
return nil
}
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
of, found := ssn.underUsedFns[plugin.Name]
if !found {
continue
}
underUsedResourceList := of(queue)
return underUsedResourceList
}
}
return api.ResourceNameList{}
}
// JobReady invoke jobready function of the plugins
func (ssn *Session) JobReady(obj interface{}) bool {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledJobReady) {
continue
}
jrf, found := ssn.jobReadyFns[plugin.Name]
if !found {
continue
}
if !jrf(obj) {
return false
}
}
}
return true
}
// JobPipelined invoke pipelined function of the plugins
// Check whether the job has obtained enough resources to run
func (ssn *Session) JobPipelined(obj interface{}) bool {
var hasFound bool
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledJobPipelined) {
continue
}
jrf, found := ssn.jobPipelinedFns[plugin.Name]
if !found {
continue
}
res := jrf(obj)
if res < 0 {
return false
}
if res > 0 {
hasFound = true
}
}
// If some plugin in this tier voted to permit while the others abstained,
// permit the job to be pipelined and do not check the next tier.
if hasFound {
return true
}
}
return true
}
// JobStarving invoke jobStarving function of the plugins
// Check whether the job still needs more resources
func (ssn *Session) JobStarving(obj interface{}) bool {
var hasFound bool
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledJobStarving) {
continue
}
jrf, found := ssn.jobStarvingFns[plugin.Name]
if !found {
continue
}
hasFound = true
if !jrf(obj) {
return false
}
}
// at least one plugin in this tier registered a jobStarving function
if hasFound {
return true
}
}
return false
}
// JobValid invoke jobvalid function of the plugins
func (ssn *Session) JobValid(obj interface{}) *api.ValidateResult {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
jrf, found := ssn.jobValidFns[plugin.Name]
if !found {
continue
}
if vr := jrf(obj); vr != nil && !vr.Pass {
return vr
}
}
}
return nil
}
// JobEnqueueable invoke jobEnqueueableFns function of the plugins
func (ssn *Session) JobEnqueueable(obj interface{}) bool {
var hasFound bool
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledJobEnqueued) {
continue
}
fn, found := ssn.jobEnqueueableFns[plugin.Name]
if !found {
continue
}
res := fn(obj)
if res < 0 {
return false
}
if res > 0 {
hasFound = true
}
}
// If some plugin in this tier voted to permit while the others abstained,
// permit the job to be enqueued and do not check the next tier.
if hasFound {
return true
}
}
return true
}
// JobEnqueued invoke jobEnqueuedFns function of the plugins
func (ssn *Session) JobEnqueued(obj interface{}) {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledJobEnqueued) {
continue
}
fn, found := ssn.jobEnqueuedFns[plugin.Name]
if !found {
continue
}
fn(obj)
}
}
}
// TargetJob invoke targetJobFns function of the plugins
func (ssn *Session) TargetJob(jobs []*api.JobInfo) *api.JobInfo {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledTargetJob) {
continue
}
fn, found := ssn.targetJobFns[plugin.Name]
if !found {
continue
}
return fn(jobs)
}
}
return nil
}
// VictimTasks invoke victimTasksFns function of the plugins
func (ssn *Session) VictimTasks() []*api.TaskInfo {
var victims []*api.TaskInfo
var init bool
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledVictim) {
continue
}
pf, found := ssn.victimTasksFns[plugin.Name]
if !found {
continue
}
candidates := pf()
if !init {
victims = candidates
init = true
} else {
var intersection []*api.TaskInfo
// Get intersection of victims and candidates.
for _, v := range victims {
for _, c := range candidates {
if v.UID == c.UID {
intersection = append(intersection, v)
}
}
}
// Update victims to intersection
victims = intersection
}
}
// Plugins in this tier made decision if victims is not nil
if victims != nil {
return victims
}
}
return victims
}
// ReservedNodes invoke ReservedNodes function of the plugins
func (ssn *Session) ReservedNodes() {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledReservedNodes) {
continue
}
fn, found := ssn.reservedNodesFns[plugin.Name]
if !found {
continue
}
fn()
}
}
}
// JobOrderFn invoke joborder function of the plugins
func (ssn *Session) JobOrderFn(l, r interface{}) bool {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledJobOrder) {
continue
}
jof, found := ssn.jobOrderFns[plugin.Name]
if !found {
continue
}
if j := jof(l, r); j != 0 {
return j < 0
}
}
}
// If no job order funcs, order job by CreationTimestamp first, then by UID.
lv := l.(*api.JobInfo)
rv := r.(*api.JobInfo)
if lv.CreationTimestamp.Equal(&rv.CreationTimestamp) {
return lv.UID < rv.UID
}
return lv.CreationTimestamp.Before(&rv.CreationTimestamp)
}
// NamespaceOrderFn invoke namespaceorder function of the plugins
func (ssn *Session) NamespaceOrderFn(l, r interface{}) bool {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledNamespaceOrder) {
continue
}
nof, found := ssn.namespaceOrderFns[plugin.Name]
if !found {
continue
}
if j := nof(l, r); j != 0 {
return j < 0
}
}
}
// TODO(lminzhw): if all NamespaceOrderFn treat these two namespaces as the same,
// we should make the job order have its effect among namespaces,
// or just schedule namespaces one by one.
lv := l.(api.NamespaceName)
rv := r.(api.NamespaceName)
return lv < rv
}
// ClusterOrderFn invoke ClusterOrderFn function of the plugins
func (ssn *Session) ClusterOrderFn(l, r interface{}) bool {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledClusterOrder) {
continue
}
cof, found := ssn.clusterOrderFns[plugin.Name]
if !found {
continue
}
if j := cof(l, r); j != 0 {
return j < 0
}
}
}
// If no cluster order funcs, order clusters by name
lv := l.(*scheduling.Cluster)
rv := r.(*scheduling.Cluster)
return lv.Name < rv.Name
}
// QueueOrderFn invoke queueorder function of the plugins
func (ssn *Session) QueueOrderFn(l, r interface{}) bool {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledQueueOrder) {
continue
}
qof, found := ssn.queueOrderFns[plugin.Name]
if !found {
continue
}
if j := qof(l, r); j != 0 {
return j < 0
}
}
}
// If no queue order funcs, order queue by CreationTimestamp first, then by UID.
lv := l.(*api.QueueInfo)
rv := r.(*api.QueueInfo)
if lv.Queue.CreationTimestamp.Equal(&rv.Queue.CreationTimestamp) {
return lv.UID < rv.UID
}
return lv.Queue.CreationTimestamp.Before(&rv.Queue.CreationTimestamp)
}
// TaskCompareFns invoke taskorder function of the plugins
func (ssn *Session) TaskCompareFns(l, r interface{}) int {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledTaskOrder) {
continue
}
tof, found := ssn.taskOrderFns[plugin.Name]
if !found {
continue
}
if j := tof(l, r); j != 0 {
return j
}
}
}
return 0
}
// TaskOrderFn invoke taskorder function of the plugins
func (ssn *Session) TaskOrderFn(l, r interface{}) bool {
if res := ssn.TaskCompareFns(l, r); res != 0 {
return res < 0
}
// If no task order funcs, order task by default func.
lv := l.(*api.TaskInfo)
rv := r.(*api.TaskInfo)
return helpers.CompareTask(lv, rv)
}
// PredicateFn invoke predicate function of the plugins
func (ssn *Session) PredicateFn(task *api.TaskInfo, node *api.NodeInfo) error {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledPredicate) {
continue
}
pfn, found := ssn.predicateFns[plugin.Name]
if !found {
continue
}
err := pfn(task, node)
if err != nil {
return err
}
}
}
return nil
}
// BestNodeFn invoke bestNode function of the plugins
func (ssn *Session) BestNodeFn(task *api.TaskInfo, nodeScores map[float64][]*api.NodeInfo) *api.NodeInfo {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledBestNode) {
continue
}
pfn, found := ssn.bestNodeFns[plugin.Name]
if !found {
continue
}
// Only the first plugin that enables and realizes bestNodeFn is allowed to choose best node for task
if bestNode := pfn(task, nodeScores); bestNode != nil {
return bestNode
}
}
}
return nil
}
// NodeOrderFn invoke node order function of the plugins
func (ssn *Session) NodeOrderFn(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
priorityScore := 0.0
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledNodeOrder) {
continue
}
pfn, found := ssn.nodeOrderFns[plugin.Name]
if !found {
continue
}
score, err := pfn(task, node)
if err != nil {
return 0, err
}
priorityScore += score
}
}
return priorityScore, nil
}
// BatchNodeOrderFn invoke node order function of the plugins
func (ssn *Session) BatchNodeOrderFn(task *api.TaskInfo, nodes []*api.NodeInfo) (map[string]float64, error) {
priorityScore := make(map[string]float64, len(nodes))
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledNodeOrder) {
continue
}
pfn, found := ssn.batchNodeOrderFns[plugin.Name]
if !found {
continue
}
score, err := pfn(task, nodes)
if err != nil {
return nil, err
}
for nodeName, score := range score {
priorityScore[nodeName] += score
}
}
}
return priorityScore, nil
}
func isEnabled(enabled *bool) bool {
return enabled != nil && *enabled
}
// NodeOrderMapFn invoke node order function of the plugins
func (ssn *Session) NodeOrderMapFn(task *api.TaskInfo, node *api.NodeInfo) (map[string]float64, float64, error) {
nodeScoreMap := map[string]float64{}
var priorityScore float64
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledNodeOrder) {
continue
}
if pfn, found := ssn.nodeOrderFns[plugin.Name]; found {
score, err := pfn(task, node)
if err != nil {
return nodeScoreMap, priorityScore, err
}
priorityScore += score
}
if pfn, found := ssn.nodeMapFns[plugin.Name]; found {
score, err := pfn(task, node)
if err != nil {
return nodeScoreMap, priorityScore, err
}
nodeScoreMap[plugin.Name] = score
}
}
}
return nodeScoreMap, priorityScore, nil
}
// NodeOrderReduceFn invoke node order function of the plugins
func (ssn *Session) NodeOrderReduceFn(task *api.TaskInfo, pluginNodeScoreMap map[string]k8sframework.NodeScoreList) (map[string]float64, error) {
nodeScoreMap := map[string]float64{}
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if !isEnabled(plugin.EnabledNodeOrder) {
continue
}
pfn, found := ssn.nodeReduceFns[plugin.Name]
if !found {
continue
}
if err := pfn(task, pluginNodeScoreMap[plugin.Name]); err != nil {
return nodeScoreMap, err
}
for _, hp := range pluginNodeScoreMap[plugin.Name] {
nodeScoreMap[hp.Name] += float64(hp.Score)
}
}
}
return nodeScoreMap, nil
}
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"fmt"
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/metrics"
)
// Operation type
type Operation int8
const (
// Evict op
Evict = iota
// Pipeline op
Pipeline
// Allocate op
Allocate
)
type operation struct {
name Operation
task *api.TaskInfo
reason string
}
// Statement structure
type Statement struct {
operations []operation
ssn *Session
}
// NewStatement returns new statement object
func NewStatement(ssn *Session) *Statement {
return &Statement{
ssn: ssn,
}
}
// Evict the pod
func (s *Statement) Evict(reclaimee *api.TaskInfo, reason string) error {
// Update status in session
if job, found := s.ssn.Jobs[reclaimee.Job]; found {
if err := job.UpdateTaskStatus(reclaimee, api.Releasing); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
reclaimee.Namespace, reclaimee.Name, api.Releasing, s.ssn.UID, err)
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
reclaimee.Job, s.ssn.UID)
}
// Update task in node.
if node, found := s.ssn.Nodes[reclaimee.NodeName]; found {
err := node.UpdateTask(reclaimee)
if err != nil {
klog.Errorf("Failed to update task <%v/%v> in node %v for: %s",
reclaimee.Namespace, reclaimee.Name, reclaimee.NodeName, err.Error())
return err
}
}
for _, eh := range s.ssn.eventHandlers {
if eh.DeallocateFunc != nil {
eh.DeallocateFunc(&Event{
Task: reclaimee,
})
}
}
s.operations = append(s.operations, operation{
name: Evict,
task: reclaimee,
reason: reason,
})
return nil
}
func (s *Statement) evict(reclaimee *api.TaskInfo, reason string) error {
if err := s.ssn.cache.Evict(reclaimee, reason); err != nil {
if e := s.unevict(reclaimee); e != nil {
klog.Errorf("Faled to unevict task <%v/%v>: %v.",
reclaimee.Namespace, reclaimee.Name, e)
}
return err
}
return nil
}
func (s *Statement) unevict(reclaimee *api.TaskInfo) error {
// Update status in session
job, found := s.ssn.Jobs[reclaimee.Job]
if found {
if err := job.UpdateTaskStatus(reclaimee, api.Running); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
reclaimee.Namespace, reclaimee.Name, api.Running, s.ssn.UID, err)
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
reclaimee.Job, s.ssn.UID)
}
// Update task in node.
if node, found := s.ssn.Nodes[reclaimee.NodeName]; found {
err := node.UpdateTask(reclaimee)
if err != nil {
klog.Errorf("Failed to update task <%v/%v> in node %v for: %s",
reclaimee.Namespace, reclaimee.Name, reclaimee.NodeName, err.Error())
return err
}
}
for _, eh := range s.ssn.eventHandlers {
if eh.AllocateFunc != nil {
eh.AllocateFunc(&Event{
Task: reclaimee,
})
}
}
return nil
}
// Pipeline the task for the node
func (s *Statement) Pipeline(task *api.TaskInfo, hostname string) error {
job, found := s.ssn.Jobs[task.Job]
if found {
if err := job.UpdateTaskStatus(task, api.Pipelined); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
task.Namespace, task.Name, api.Pipelined, s.ssn.UID, err)
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
task.Job, s.ssn.UID)
}
task.NodeName = hostname
if node, found := s.ssn.Nodes[hostname]; found {
if err := node.AddTask(task); err != nil {
klog.Errorf("Failed to pipeline task <%v/%v> to node <%v> in Session <%v>: %v",
task.Namespace, task.Name, hostname, s.ssn.UID, err)
}
klog.V(3).Infof("After pipelined Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
} else {
klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
hostname, s.ssn.UID)
}
for _, eh := range s.ssn.eventHandlers {
if eh.AllocateFunc != nil {
eh.AllocateFunc(&Event{
Task: task,
})
}
}
s.operations = append(s.operations, operation{
name: Pipeline,
task: task,
})
return nil
}
// pipeline is a no-op on commit: pipelining only changes state within the session,
// so there is nothing to flush to the cache.
func (s *Statement) pipeline(task *api.TaskInfo) {
}
func (s *Statement) unpipeline(task *api.TaskInfo) error {
job, found := s.ssn.Jobs[task.Job]
if found {
if err := job.UpdateTaskStatus(task, api.Pending); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
task.Namespace, task.Name, api.Pending, s.ssn.UID, err)
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
task.Job, s.ssn.UID)
}
if node, found := s.ssn.Nodes[task.NodeName]; found {
if err := node.RemoveTask(task); err != nil {
klog.Errorf("Failed to pipeline task <%v/%v> to node <%v> in Session <%v>: %v",
task.Namespace, task.Name, task.NodeName, s.ssn.UID, err)
}
klog.V(3).Infof("After pipelined Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
} else {
klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
task.NodeName, s.ssn.UID)
}
for _, eh := range s.ssn.eventHandlers {
if eh.DeallocateFunc != nil {
eh.DeallocateFunc(&Event{
Task: task,
})
}
}
task.NodeName = ""
return nil
}
// Allocate the task to node
func (s *Statement) Allocate(task *api.TaskInfo, nodeInfo *api.NodeInfo) error {
podVolumes, err := s.ssn.cache.GetPodVolumes(task, nodeInfo.Node)
if err != nil {
return err
}
hostname := nodeInfo.Name
if err := s.ssn.cache.AllocateVolumes(task, hostname, podVolumes); err != nil {
return err
}
task.Pod.Spec.NodeName = hostname
task.PodVolumes = podVolumes
// Only update status in session
job, found := s.ssn.Jobs[task.Job]
if found {
if err := job.UpdateTaskStatus(task, api.Allocated); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
task.Namespace, task.Name, api.Allocated, s.ssn.UID, err)
return err
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
task.Job, s.ssn.UID)
return fmt.Errorf("failed to find job %s", task.Job)
}
task.NodeName = hostname
if node, found := s.ssn.Nodes[hostname]; found {
if err := node.AddTask(task); err != nil {
klog.Errorf("Failed to add task <%v/%v> to node <%v> in Session <%v>: %v",
task.Namespace, task.Name, hostname, s.ssn.UID, err)
return err
}
klog.V(3).Infof("After allocated Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
} else {
klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
hostname, s.ssn.UID)
return fmt.Errorf("failed to find node %s", hostname)
}
// Callbacks
for _, eh := range s.ssn.eventHandlers {
if eh.AllocateFunc != nil {
eh.AllocateFunc(&Event{
Task: task,
})
}
}
// Update status in session
klog.V(3).Info("Allocating operations ...")
s.operations = append(s.operations, operation{
name: Allocate,
task: task,
})
return nil
}
func (s *Statement) allocate(task *api.TaskInfo) error {
if err := s.ssn.cache.AddBindTask(task); err != nil {
return err
}
if job, found := s.ssn.Jobs[task.Job]; found {
if err := job.UpdateTaskStatus(task, api.Binding); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
task.Namespace, task.Name, api.Binding, s.ssn.UID, err)
return err
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
task.Job, s.ssn.UID)
return fmt.Errorf("failed to find job %s", task.Job)
}
metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
return nil
}
// unallocate the pod for task
func (s *Statement) unallocate(task *api.TaskInfo) error {
// Update status in session
job, found := s.ssn.Jobs[task.Job]
if found {
if err := job.UpdateTaskStatus(task, api.Pending); err != nil {
klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
task.Namespace, task.Name, api.Pending, s.ssn.UID, err)
}
} else {
klog.Errorf("Failed to find Job <%s> in Session <%s> index when unallocating.",
task.Job, s.ssn.UID)
}
if node, found := s.ssn.Nodes[task.NodeName]; found {
klog.V(3).Infof("Remove Task <%v> on node <%v>", task.Name, task.NodeName)
err := node.RemoveTask(task)
if err != nil {
klog.Errorf("Failed to remove Task <%v> on node <%v>: %s", task.Name, task.NodeName, err.Error())
}
}
for _, eh := range s.ssn.eventHandlers {
if eh.DeallocateFunc != nil {
eh.DeallocateFunc(&Event{
Task: task,
})
}
}
task.NodeName = ""
return nil
}
// Discard operation for evict, pipeline and allocate
func (s *Statement) Discard() {
klog.V(3).Info("Discarding operations ...")
for i := len(s.operations) - 1; i >= 0; i-- {
op := s.operations[i]
op.task.GenerateLastTxContext()
switch op.name {
case Evict:
err := s.unevict(op.task)
if err != nil {
klog.Errorf("Failed to unevict task: %s", err.Error())
}
case Pipeline:
err := s.unpipeline(op.task)
if err != nil {
klog.Errorf("Failed to unpipeline task: %s", err.Error())
}
case Allocate:
err := s.unallocate(op.task)
if err != nil {
klog.Errorf("Failed to unallocate task: %s", err.Error())
}
}
}
}
// Commit operation for evict, pipeline and allocate
func (s *Statement) Commit() {
klog.V(3).Info("Committing operations ...")
for _, op := range s.operations {
op.task.ClearLastTxContext()
switch op.name {
case Evict:
err := s.evict(op.task, op.reason)
if err != nil {
klog.Errorf("Failed to evict task: %s", err.Error())
}
case Pipeline:
s.pipeline(op.task)
case Allocate:
err := s.allocate(op.task)
if err != nil {
klog.Errorf("Failed to allocate task: for %s", err.Error())
}
}
}
}
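// Hedged usage sketch (hypothetical caller, not part of this file): an action
// records tentative decisions on a Statement and then either Commits them all
// or Discards them, which rolls every recorded operation back in reverse order.
func exampleStatementTransaction(s *Statement, task *api.TaskInfo, node *api.NodeInfo) {
if err := s.Allocate(task, node); err != nil {
// Allocation failed inside the session; undo anything recorded so far.
s.Discard()
return
}
// Apply the recorded operations to the scheduler cache.
s.Commit()
}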
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package binpack
import (
"fmt"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/framework"
)
const (
// PluginName indicates name of volcano scheduler plugin.
PluginName = "binpack"
)
const (
// BinpackWeight is the key for providing Binpack Priority Weight in YAML
BinpackWeight = "binpack.weight"
// BinpackCPU is the key for weight of cpu
BinpackCPU = "binpack.cpu"
// BinpackMemory is the key for weight of memory
BinpackMemory = "binpack.memory"
// BinpackResources is the key for additional resource key name
BinpackResources = "binpack.resources"
// BinpackResourcesPrefix is the key prefix for additional resource key name
BinpackResourcesPrefix = BinpackResources + "."
resourceFmt = "%s[%d]"
)
type priorityWeight struct {
BinPackingWeight int
BinPackingCPU int
BinPackingMemory int
BinPackingResources map[v1.ResourceName]int
}
func (w *priorityWeight) String() string {
length := 3
if extendLength := len(w.BinPackingResources); extendLength == 0 {
length++
} else {
length += extendLength
}
msg := make([]string, 0, length)
msg = append(msg,
fmt.Sprintf(resourceFmt, BinpackWeight, w.BinPackingWeight),
fmt.Sprintf(resourceFmt, BinpackCPU, w.BinPackingCPU),
fmt.Sprintf(resourceFmt, BinpackMemory, w.BinPackingMemory),
)
if len(w.BinPackingResources) == 0 {
msg = append(msg, "no extend resources.")
} else {
for name, weight := range w.BinPackingResources {
msg = append(msg, fmt.Sprintf(resourceFmt, name, weight))
}
}
return strings.Join(msg, ", ")
}
type binpackPlugin struct {
// Arguments given for the plugin
weight priorityWeight
}
// New returns a binpack plugin object
func New(arguments framework.Arguments) framework.Plugin {
weight := calculateWeight(arguments)
return &binpackPlugin{weight: weight}
}
func calculateWeight(args framework.Arguments) priorityWeight {
/*
Users should give priorityWeight in this format (binpack.weight, binpack.cpu, binpack.memory).
The weights of cpu, memory and additional resources can be changed through arguments, for example:
actions: "enqueue, reclaim, allocate, backfill, preempt"
tiers:
- plugins:
- name: binpack
arguments:
binpack.weight: 10
binpack.cpu: 5
binpack.memory: 1
binpack.resources: nvidia.com/gpu, example.com/foo
binpack.resources.nvidia.com/gpu: 2
binpack.resources.example.com/foo: 3
*/
// Values are initialized to 1.
weight := priorityWeight{
BinPackingWeight: 1,
BinPackingCPU: 1,
BinPackingMemory: 1,
BinPackingResources: make(map[v1.ResourceName]int),
}
// Checks whether binpack.weight is provided or not, if given, modifies the value in weight struct.
args.GetInt(&weight.BinPackingWeight, BinpackWeight)
// Checks whether binpack.cpu is provided or not, if given, modifies the value in weight struct.
args.GetInt(&weight.BinPackingCPU, BinpackCPU)
if weight.BinPackingCPU < 0 {
weight.BinPackingCPU = 1
}
// Checks whether binpack.memory is provided or not, if given, modifies the value in weight struct.
args.GetInt(&weight.BinPackingMemory, BinpackMemory)
if weight.BinPackingMemory < 0 {
weight.BinPackingMemory = 1
}
resourcesStr := args[BinpackResources]
resources := strings.Split(resourcesStr, ",")
for _, resource := range resources {
resource = strings.TrimSpace(resource)
if resource == "" {
continue
}
// binpack.resources.[ResourceName]
resourceKey := BinpackResourcesPrefix + resource
resourceWeight := 1
args.GetInt(&resourceWeight, resourceKey)
if resourceWeight < 0 {
resourceWeight = 1
}
weight.BinPackingResources[v1.ResourceName(resource)] = resourceWeight
}
return weight
}
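// Hedged sketch (hypothetical configuration, assuming framework.Arguments is the
// string-keyed map indexed above): the YAML shown in the calculateWeight comment
// would yield weight 10, cpu 5, memory 1, nvidia.com/gpu 2 and example.com/foo 3.
func exampleBinpackWeight() priorityWeight {
args := framework.Arguments{
BinpackWeight:    "10",
BinpackCPU:       "5",
BinpackMemory:    "1",
BinpackResources: "nvidia.com/gpu, example.com/foo",
BinpackResourcesPrefix + "nvidia.com/gpu":  "2",
BinpackResourcesPrefix + "example.com/foo": "3",
}
return calculateWeight(args)
}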
func (bp *binpackPlugin) Name() string {
return PluginName
}
func (bp *binpackPlugin) OnSessionOpen(ssn *framework.Session) {
klog.V(4).Infof("Enter binpack plugin ...")
if klog.V(4) {
defer func() {
klog.V(4).Infof("Leaving binpack plugin. %s ...", bp.weight.String())
}()
notFoundResource := []string{}
for resource := range bp.weight.BinPackingResources {
found := false
for _, nodeInfo := range ssn.Nodes {
if nodeInfo.Allocatable.Get(resource) > 0 {
found = true
break
}
}
if !found {
notFoundResource = append(notFoundResource, string(resource))
}
}
klog.V(4).Infof("resources [%s] record in weight but not found on any node", strings.Join(notFoundResource, ", "))
}
nodeOrderFn := func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
binPackingScore := BinPackingScore(task, node, bp.weight)
klog.V(4).Infof("Binpack score for Task %s/%s on node %s is: %v", task.Namespace, task.Name, node.Name, binPackingScore)
return binPackingScore, nil
}
if bp.weight.BinPackingWeight != 0 {
ssn.AddNodeOrderFn(bp.Name(), nodeOrderFn)
} else {
klog.Infof("binpack weight is zero, skip node order function")
}
}
func (bp *binpackPlugin) OnSessionClose(ssn *framework.Session) {
}
// BinPackingScore uses the best-fit policy during scheduling.
// Goals:
// - Schedule Jobs using BestFit Policy using Resource Bin Packing Priority Function
// - Reduce Fragmentation of scarce resources on the Cluster
func BinPackingScore(task *api.TaskInfo, node *api.NodeInfo, weight priorityWeight) float64 {
score := 0.0
weightSum := 0
requested := task.Resreq
allocatable := node.Allocatable
used := node.Used
for _, resource := range requested.ResourceNames() {
request := requested.Get(resource)
if request == 0 {
continue
}
allocate := allocatable.Get(resource)
nodeUsed := used.Get(resource)
resourceWeight := 0
found := false
switch resource {
case v1.ResourceCPU:
resourceWeight = weight.BinPackingCPU
found = true
case v1.ResourceMemory:
resourceWeight = weight.BinPackingMemory
found = true
default:
resourceWeight, found = weight.BinPackingResources[resource]
}
if !found {
continue
}
resourceScore := ResourceBinPackingScore(request, allocate, nodeUsed, resourceWeight)
klog.V(5).Infof("task %s/%s on node %s resource %s, need %f, used %f, allocatable %f, weight %d, score %f", task.Namespace, task.Name, node.Name, resource, request, nodeUsed, allocate, resourceWeight, resourceScore)
score += resourceScore
weightSum += resourceWeight
}
// map the result from [0, weightSum] to [0, MaxNodeScore*binpack.weight]
if weightSum > 0 {
score /= float64(weightSum)
}
score *= float64(v1alpha1.MaxNodeScore * int64(weight.BinPackingWeight))
return score
}
// ResourceBinPackingScore calculates the binpack score for a resource with the provided info
func ResourceBinPackingScore(requested, capacity, used float64, weight int) float64 {
if capacity == 0 || weight == 0 {
return 0
}
usedFinally := requested + used
if usedFinally > capacity {
return 0
}
score := usedFinally * float64(weight) / capacity
return score
}
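// Illustrative sketch (hypothetical numbers, not part of the plugin): for a node
// with 8000m CPU allocatable of which 2000m is used, a task requesting 2000m and
// a cpu weight of 1, the raw score is (2000+2000)/8000*1 = 0.5; BinPackingScore
// then divides the summed scores by the weight sum and scales the result by
// MaxNodeScore and the plugin weight.
func exampleResourceBinPackingScore() float64 {
requested := 2000.0 // milli-CPU requested by the task
used := 2000.0      // milli-CPU already used on the node
capacity := 8000.0  // milli-CPU allocatable on the node
weight := 1         // binpack.cpu weight
return ResourceBinPackingScore(requested, capacity, used, weight)
}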
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package drf
import (
"fmt"
"math"
"strconv"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/api/helpers"
"volcano.sh/volcano/pkg/scheduler/framework"
"volcano.sh/volcano/pkg/scheduler/metrics"
"volcano.sh/volcano/pkg/scheduler/plugins/util"
)
// PluginName indicates name of volcano scheduler plugin.
const PluginName = "drf"
var shareDelta = 0.000001
// hierarchicalNode represents the node hierarchy
// and the corresponding weight and drf attribute
type hierarchicalNode struct {
parent *hierarchicalNode
attr *drfAttr
// If the node is a leaf node,
// request represents the request of the job.
request *api.Resource
weight float64
saturated bool
hierarchy string
children map[string]*hierarchicalNode
}
func (node *hierarchicalNode) Clone(parent *hierarchicalNode) *hierarchicalNode {
newNode := &hierarchicalNode{
parent: parent,
attr: &drfAttr{
share: node.attr.share,
dominantResource: node.attr.dominantResource,
allocated: node.attr.allocated.Clone(),
},
request: node.request.Clone(),
weight: node.weight,
saturated: node.saturated,
hierarchy: node.hierarchy,
children: nil,
}
if node.children != nil {
newNode.children = map[string]*hierarchicalNode{}
for _, child := range node.children {
newNode.children[child.hierarchy] = child.Clone(newNode)
}
}
return newNode
}
// resourceSaturated returns true if any resource requested by the job is already fully allocated to it, or if the job requests a resource whose cluster capacity is already fully allocated
func resourceSaturated(allocated *api.Resource,
jobRequest *api.Resource, demandingResources map[v1.ResourceName]bool) bool {
for _, rn := range allocated.ResourceNames() {
if allocated.Get(rn) != 0 && jobRequest.Get(rn) != 0 &&
allocated.Get(rn) >= jobRequest.Get(rn) {
return true
}
if !demandingResources[rn] && jobRequest.Get(rn) != 0 {
return true
}
}
return false
}
type drfAttr struct {
share float64
dominantResource string
allocated *api.Resource
}
func (attr *drfAttr) String() string {
return fmt.Sprintf("dominant resource <%s>, dominant share %f, allocated %s",
attr.dominantResource, attr.share, attr.allocated)
}
type drfPlugin struct {
totalResource *api.Resource
totalAllocated *api.Resource
// Key is Job ID
jobAttrs map[api.JobID]*drfAttr
// map[namespaceName]->attr
namespaceOpts map[string]*drfAttr
// hierarchical tree root
hierarchicalRoot *hierarchicalNode
// Arguments given for the plugin
pluginArguments framework.Arguments
}
// New returns a drf plugin
func New(arguments framework.Arguments) framework.Plugin {
return &drfPlugin{
totalResource: api.EmptyResource(),
totalAllocated: api.EmptyResource(),
jobAttrs: map[api.JobID]*drfAttr{},
namespaceOpts: map[string]*drfAttr{},
hierarchicalRoot: &hierarchicalNode{
attr: &drfAttr{allocated: api.EmptyResource()},
request: api.EmptyResource(),
hierarchy: "root",
weight: 1,
children: map[string]*hierarchicalNode{},
},
pluginArguments: arguments,
}
}
func (drf *drfPlugin) Name() string {
return PluginName
}
// HierarchyEnabled returns whether hierarchical scheduling is enabled for this plugin in the session
func (drf *drfPlugin) HierarchyEnabled(ssn *framework.Session) bool {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if plugin.Name != PluginName {
continue
}
return plugin.EnabledHierarchy != nil && *plugin.EnabledHierarchy
}
}
return false
}
// NamespaceOrderEnabled returns whether NamespaceOrder is enabled for this plugin in this session
func (drf *drfPlugin) NamespaceOrderEnabled(ssn *framework.Session) bool {
for _, tier := range ssn.Tiers {
for _, plugin := range tier.Plugins {
if plugin.Name != PluginName {
continue
}
return plugin.EnabledNamespaceOrder != nil && *plugin.EnabledNamespaceOrder
}
}
return false
}
func (drf *drfPlugin) compareQueues(root *hierarchicalNode, lqueue *api.QueueInfo, rqueue *api.QueueInfo) float64 {
lnode := root
lpaths := strings.Split(lqueue.Hierarchy, "/")
rnode := root
rpaths := strings.Split(rqueue.Hierarchy, "/")
depth := 0
if len(lpaths) < len(rpaths) {
depth = len(lpaths)
} else {
depth = len(rpaths)
}
for i := 0; i < depth; i++ {
// Saturated nodes have minimum priority,
// so that demanding nodes will be popped first.
if !lnode.saturated && rnode.saturated {
return -1
}
if lnode.saturated && !rnode.saturated {
return 1
}
if lnode.attr.share/lnode.weight == rnode.attr.share/rnode.weight {
if i < depth-1 {
lnode = lnode.children[lpaths[i+1]]
rnode = rnode.children[rpaths[i+1]]
}
} else {
return lnode.attr.share/lnode.weight - rnode.attr.share/rnode.weight
}
}
return 0
}
func (drf *drfPlugin) OnSessionOpen(ssn *framework.Session) {
// Prepare scheduling data for this session.
drf.totalResource.Add(ssn.TotalResource)
klog.V(4).Infof("Total Allocatable %s", drf.totalResource)
namespaceOrderEnabled := drf.NamespaceOrderEnabled(ssn)
hierarchyEnabled := drf.HierarchyEnabled(ssn)
for _, job := range ssn.Jobs {
attr := &drfAttr{
allocated: api.EmptyResource(),
}
for status, tasks := range job.TaskStatusIndex {
if api.AllocatedStatus(status) {
for _, t := range tasks {
attr.allocated.Add(t.Resreq)
}
}
}
// Calculate the init share of Job
drf.updateJobShare(job.Namespace, job.Name, attr)
drf.jobAttrs[job.UID] = attr
if namespaceOrderEnabled {
nsOpts, found := drf.namespaceOpts[job.Namespace]
if !found {
nsOpts = &drfAttr{
allocated: api.EmptyResource(),
}
drf.namespaceOpts[job.Namespace] = nsOpts
}
// all tasks in a job should have the same namespace as the job
nsOpts.allocated.Add(attr.allocated)
drf.updateNamespaceShare(job.Namespace, nsOpts)
}
if hierarchyEnabled {
queue := ssn.Queues[job.Queue]
drf.totalAllocated.Add(attr.allocated)
drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
}
}
preemptableFn := func(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) ([]*api.TaskInfo, int) {
var victims []*api.TaskInfo
addVictim := func(candidate *api.TaskInfo) {
victims = append(victims, candidate)
}
if namespaceOrderEnabled {
// apply the namespace share policy on preemptee firstly
lWeight := ssn.NamespaceInfo[api.NamespaceName(preemptor.Namespace)].GetWeight()
lNsAtt := drf.namespaceOpts[preemptor.Namespace]
lNsAlloc := lNsAtt.allocated.Clone().Add(preemptor.Resreq)
_, lNsShare := drf.calculateShare(lNsAlloc, drf.totalResource)
lNsShareWeighted := lNsShare / float64(lWeight)
namespaceAllocation := map[string]*api.Resource{}
// undecidedPreemptees are preemptees this policy cannot judge as preemptable
// or not; they are left to the next policy
undecidedPreemptees := []*api.TaskInfo{}
for _, preemptee := range preemptees {
if preemptor.Namespace == preemptee.Namespace {
// policy is disabled when they are in the same namespace
undecidedPreemptees = append(undecidedPreemptees, preemptee)
continue
}
// compute the preemptee namespace weighted share after preemption
nsAllocation, found := namespaceAllocation[preemptee.Namespace]
if !found {
rNsAtt := drf.namespaceOpts[preemptee.Namespace]
nsAllocation = rNsAtt.allocated.Clone()
namespaceAllocation[preemptee.Namespace] = nsAllocation
}
rWeight := ssn.NamespaceInfo[api.NamespaceName(preemptee.Namespace)].GetWeight()
rNsAlloc := nsAllocation.Sub(preemptee.Resreq)
_, rNsShare := drf.calculateShare(rNsAlloc, drf.totalResource)
rNsShareWeighted := rNsShare / float64(rWeight)
// to avoid ping pong actions, the preemptee namespace should
// have the higher weighted share after preemption.
if lNsShareWeighted < rNsShareWeighted {
addVictim(preemptee)
continue
}
if lNsShareWeighted-rNsShareWeighted > shareDelta {
continue
}
// equal namespace order leads to judgement of jobOrder
undecidedPreemptees = append(undecidedPreemptees, preemptee)
}
preemptees = undecidedPreemptees
}
latt := drf.jobAttrs[preemptor.Job]
lalloc := latt.allocated.Clone().Add(preemptor.Resreq)
_, ls := drf.calculateShare(lalloc, drf.totalResource)
allocations := map[api.JobID]*api.Resource{}
for _, preemptee := range preemptees {
if _, found := allocations[preemptee.Job]; !found {
ratt := drf.jobAttrs[preemptee.Job]
allocations[preemptee.Job] = ratt.allocated.Clone()
}
ralloc := allocations[preemptee.Job].Sub(preemptee.Resreq)
_, rs := drf.calculateShare(ralloc, drf.totalResource)
if ls < rs || math.Abs(ls-rs) <= shareDelta {
addVictim(preemptee)
}
}
klog.V(4).Infof("Victims from DRF plugins are %+v", victims)
return victims, util.Permit
}
ssn.AddPreemptableFn(drf.Name(), preemptableFn)
if hierarchyEnabled {
queueOrderFn := func(l interface{}, r interface{}) int {
lv := l.(*api.QueueInfo)
rv := r.(*api.QueueInfo)
ret := drf.compareQueues(drf.hierarchicalRoot, lv, rv)
if ret < 0 {
return -1
}
if ret > 0 {
return 1
}
return 0
}
ssn.AddQueueOrderFn(drf.Name(), queueOrderFn)
reclaimFn := func(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) ([]*api.TaskInfo, int) {
var victims []*api.TaskInfo
// clone hdrf tree
totalAllocated := drf.totalAllocated.Clone()
root := drf.hierarchicalRoot.Clone(nil)
// update reclaimer hdrf
ljob := ssn.Jobs[reclaimer.Job]
lqueue := ssn.Queues[ljob.Queue]
ljob = ljob.Clone()
attr := drf.jobAttrs[ljob.UID]
lattr := &drfAttr{
allocated: attr.allocated.Clone(),
}
lattr.allocated.Add(reclaimer.Resreq)
totalAllocated.Add(reclaimer.Resreq)
drf.updateShare(lattr)
drf.UpdateHierarchicalShare(root, totalAllocated, ljob, lattr, lqueue.Hierarchy, lqueue.Weights)
for _, preemptee := range reclaimees {
rjob := ssn.Jobs[preemptee.Job]
rqueue := ssn.Queues[rjob.Queue]
// update hdrf of reclaimee job
totalAllocated.Sub(preemptee.Resreq)
rjob = rjob.Clone()
attr := drf.jobAttrs[rjob.UID]
rattr := &drfAttr{
allocated: attr.allocated.Clone(),
}
rattr.allocated.Sub(preemptee.Resreq)
drf.updateShare(rattr)
drf.UpdateHierarchicalShare(root, totalAllocated, rjob, rattr, rqueue.Hierarchy, rqueue.Weights)
// compare hdrf of queues
ret := drf.compareQueues(root, lqueue, rqueue)
// resume hdrf of reclaimee job
totalAllocated.Add(preemptee.Resreq)
rattr.allocated.Add(preemptee.Resreq)
drf.updateShare(rattr)
drf.UpdateHierarchicalShare(root, totalAllocated, rjob, rattr, rqueue.Hierarchy, rqueue.Weights)
if ret < 0 {
victims = append(victims, preemptee)
}
if ret > shareDelta {
continue
}
}
klog.V(4).Infof("Victims from HDRF plugins are %+v", victims)
return victims, util.Permit
}
ssn.AddReclaimableFn(drf.Name(), reclaimFn)
}
jobOrderFn := func(l interface{}, r interface{}) int {
lv := l.(*api.JobInfo)
rv := r.(*api.JobInfo)
klog.V(4).Infof("DRF JobOrderFn: <%v/%v> share state: %v, <%v/%v> share state: %v",
lv.Namespace, lv.Name, drf.jobAttrs[lv.UID].share, rv.Namespace, rv.Name, drf.jobAttrs[rv.UID].share)
if drf.jobAttrs[lv.UID].share == drf.jobAttrs[rv.UID].share {
return 0
}
if drf.jobAttrs[lv.UID].share < drf.jobAttrs[rv.UID].share {
return -1
}
return 1
}
ssn.AddJobOrderFn(drf.Name(), jobOrderFn)
namespaceOrderFn := func(l interface{}, r interface{}) int {
lv := l.(api.NamespaceName)
rv := r.(api.NamespaceName)
lOpt := drf.namespaceOpts[string(lv)]
rOpt := drf.namespaceOpts[string(rv)]
lWeight := ssn.NamespaceInfo[lv].GetWeight()
rWeight := ssn.NamespaceInfo[rv].GetWeight()
klog.V(4).Infof("DRF NamespaceOrderFn: <%v> share state: %f, weight %v, <%v> share state: %f, weight %v",
lv, lOpt.share, lWeight, rv, rOpt.share, rWeight)
lWeightedShare := lOpt.share / float64(lWeight)
rWeightedShare := rOpt.share / float64(rWeight)
metrics.UpdateNamespaceWeight(string(lv), lWeight)
metrics.UpdateNamespaceWeight(string(rv), rWeight)
metrics.UpdateNamespaceWeightedShare(string(lv), lWeightedShare)
metrics.UpdateNamespaceWeightedShare(string(rv), rWeightedShare)
if lWeightedShare == rWeightedShare {
return 0
}
if lWeightedShare < rWeightedShare {
return -1
}
return 1
}
if namespaceOrderEnabled {
ssn.AddNamespaceOrderFn(drf.Name(), namespaceOrderFn)
}
// Register event handlers.
ssn.AddEventHandler(&framework.EventHandler{
AllocateFunc: func(event *framework.Event) {
attr := drf.jobAttrs[event.Task.Job]
attr.allocated.Add(event.Task.Resreq)
job := ssn.Jobs[event.Task.Job]
drf.updateJobShare(job.Namespace, job.Name, attr)
nsShare := -1.0
if namespaceOrderEnabled {
nsOpt := drf.namespaceOpts[event.Task.Namespace]
nsOpt.allocated.Add(event.Task.Resreq)
drf.updateNamespaceShare(event.Task.Namespace, nsOpt)
nsShare = nsOpt.share
}
if hierarchyEnabled {
queue := ssn.Queues[job.Queue]
drf.totalAllocated.Add(event.Task.Resreq)
drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
}
klog.V(4).Infof("DRF AllocateFunc: task <%v/%v>, resreq <%v>, share <%v>, namespace share <%v>",
event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share, nsShare)
},
DeallocateFunc: func(event *framework.Event) {
attr := drf.jobAttrs[event.Task.Job]
attr.allocated.Sub(event.Task.Resreq)
job := ssn.Jobs[event.Task.Job]
drf.updateJobShare(job.Namespace, job.Name, attr)
nsShare := -1.0
if namespaceOrderEnabled {
nsOpt := drf.namespaceOpts[event.Task.Namespace]
nsOpt.allocated.Sub(event.Task.Resreq)
drf.updateNamespaceShare(event.Task.Namespace, nsOpt)
nsShare = nsOpt.share
}
if hierarchyEnabled {
queue := ssn.Queues[job.Queue]
drf.totalAllocated.Sub(event.Task.Resreq)
drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
}
klog.V(4).Infof("DRF EvictFunc: task <%v/%v>, resreq <%v>, share <%v>, namespace share <%v>",
event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share, nsShare)
},
})
}
func (drf *drfPlugin) updateNamespaceShare(namespaceName string, attr *drfAttr) {
drf.updateShare(attr)
metrics.UpdateNamespaceShare(namespaceName, attr.share)
}
// build hierarchy if the node does not exist
func (drf *drfPlugin) buildHierarchy(root *hierarchicalNode, job *api.JobInfo, attr *drfAttr,
hierarchy, hierarchicalWeights string) {
inode := root
paths := strings.Split(hierarchy, "/")
weights := strings.Split(hierarchicalWeights, "/")
for i := 1; i < len(paths); i++ {
if child, ok := inode.children[paths[i]]; ok {
inode = child
} else {
fweight, _ := strconv.ParseFloat(weights[i], 64)
if fweight < 1 {
fweight = 1
}
child = &hierarchicalNode{
weight: fweight,
hierarchy: paths[i],
request: api.EmptyResource(),
attr: &drfAttr{
allocated: api.EmptyResource(),
},
children: make(map[string]*hierarchicalNode),
}
klog.V(4).Infof("Node %s added to %s, weight %f",
child.hierarchy, inode.hierarchy, fweight)
inode.children[paths[i]] = child
child.parent = inode
inode = child
}
}
child := &hierarchicalNode{
weight: 1,
attr: attr,
hierarchy: string(job.UID),
request: job.TotalRequest.Clone(),
children: nil,
}
inode.children[string(job.UID)] = child
// update drf attribute bottom up
klog.V(4).Infof("Job <%s/%s> added to %s, weights %s, attr %v, total request: %s",
job.Namespace, job.Name, inode.hierarchy, hierarchicalWeights, child.attr, job.TotalRequest)
}
// updateHierarchicalShare updates the node attributes recursively
func (drf *drfPlugin) updateHierarchicalShare(node *hierarchicalNode,
demandingResources map[v1.ResourceName]bool) {
if node.children == nil {
node.saturated = resourceSaturated(node.attr.allocated,
node.request, demandingResources)
klog.V(4).Infof("Update hierarchical node %s, share %f, dominant %s, resource %v, saturated: %t",
node.hierarchy, node.attr.share, node.attr.dominantResource, node.attr.allocated, node.saturated)
} else {
var mdr float64 = 1
// get minimum dominant resource share
for _, child := range node.children {
drf.updateHierarchicalShare(child, demandingResources)
// skip empty child and saturated child
if child.attr.share != 0 && !child.saturated {
_, resShare := drf.calculateShare(child.attr.allocated, drf.totalResource)
if resShare < mdr {
mdr = resShare
}
}
}
node.attr.allocated = api.EmptyResource()
saturated := true
for _, child := range node.children {
if !child.saturated {
saturated = false
}
// only consider non-empty children
if child.attr.share != 0 {
// saturated child is not scaled
if child.saturated {
t := child.attr.allocated
node.attr.allocated.Add(t)
} else {
t := child.attr.allocated.Clone().Multi(mdr / child.attr.share)
node.attr.allocated.Add(t)
}
}
}
node.attr.dominantResource, node.attr.share = drf.calculateShare(
node.attr.allocated, drf.totalResource)
node.saturated = saturated
klog.V(4).Infof("Update hierarchical node %s, share %f, dominant resource %s, resource %v, saturated: %t",
node.hierarchy, node.attr.share, node.attr.dominantResource, node.attr.allocated, node.saturated)
}
}
func (drf *drfPlugin) UpdateHierarchicalShare(root *hierarchicalNode, totalAllocated *api.Resource, job *api.JobInfo, attr *drfAttr, hierarchy, hierarchicalWeights string) {
// filter out demanding resources
demandingResources := map[v1.ResourceName]bool{}
for _, rn := range drf.totalResource.ResourceNames() {
if totalAllocated.Get(rn) < drf.totalResource.Get(rn) {
demandingResources[rn] = true
}
}
drf.buildHierarchy(root, job, attr, hierarchy, hierarchicalWeights)
drf.updateHierarchicalShare(root, demandingResources)
}
func (drf *drfPlugin) updateJobShare(jobNs, jobName string, attr *drfAttr) {
drf.updateShare(attr)
metrics.UpdateJobShare(jobNs, jobName, attr.share)
}
func (drf *drfPlugin) updateShare(attr *drfAttr) {
attr.dominantResource, attr.share = drf.calculateShare(attr.allocated, drf.totalResource)
}
func (drf *drfPlugin) calculateShare(allocated, totalResource *api.Resource) (string, float64) {
res := float64(0)
dominantResource := ""
for _, rn := range totalResource.ResourceNames() {
share := helpers.Share(allocated.Get(rn), totalResource.Get(rn))
if share > res {
res = share
dominantResource = string(rn)
}
}
return dominantResource, res
}
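// Illustrative sketch (hypothetical cluster sizes, not part of the plugin): for a
// cluster of 10 CPUs and 20GiB of memory, a job allocated 2 CPUs and 8GiB has
// shares 0.2 (cpu) and 0.4 (memory), so calculateShare reports memory as the
// dominant resource with share 0.4.
func exampleDominantShare(drf *drfPlugin) (string, float64) {
total := api.EmptyResource()
total.MilliCPU = 10 * 1000
total.Memory = 20 * 1024 * 1024 * 1024
allocated := api.EmptyResource()
allocated.MilliCPU = 2 * 1000
allocated.Memory = 8 * 1024 * 1024 * 1024
return drf.calculateShare(allocated, total)
}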
func (drf *drfPlugin) OnSessionClose(session *framework.Session) {
// Clean schedule data.
drf.totalResource = api.EmptyResource()
drf.totalAllocated = api.EmptyResource()
drf.jobAttrs = map[api.JobID]*drfAttr{}
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package policy
import (
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
"volcano.sh/volcano/pkg/scheduler/api"
)
// TopologyHint is a struct containing the NUMANodeAffinity for a Container
type TopologyHint struct {
NUMANodeAffinity bitmask.BitMask
// Preferred is set to true when the NUMANodeAffinity encodes a preferred
// allocation for the Container. It is set to false otherwise.
Preferred bool
}
// Policy is an interface for topology manager policy
type Policy interface {
// Predicate returns the best hint and whether the pod can be admitted.
Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool)
}
// HintProvider is an interface for components that want to collaborate to
// achieve globally optimal concrete resource alignment with respect to
// NUMA locality.
type HintProvider interface {
// Name returns provider name used for register and logging.
Name() string
// GetTopologyHints returns hints for each resource this hint provider has a preference for.
GetTopologyHints(container *v1.Container, topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string][]TopologyHint
Allocate(container *v1.Container, bestHit *TopologyHint, topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string]cpuset.CPUSet
}
// GetPolicy returns the policy matching the node's topology manager configuration
func GetPolicy(node *api.NodeInfo, numaNodes []int) Policy {
switch batch.NumaPolicy(node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy]) {
case batch.None:
return NewPolicyNone(numaNodes)
case batch.BestEffort:
return NewPolicyBestEffort(numaNodes)
case batch.Restricted:
return NewPolicyRestricted(numaNodes)
case batch.SingleNumaNode:
return NewPolicySingleNumaNode(numaNodes)
}
return &policyNone{}
}
// AccumulateProvidersHints returns all TopologyHint collections from the different providers
func AccumulateProvidersHints(container *v1.Container,
topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets,
hintProviders []HintProvider) (providersHints []map[string][]TopologyHint) {
for _, provider := range hintProviders {
hints := provider.GetTopologyHints(container, topoInfo, resNumaSets)
providersHints = append(providersHints, hints)
}
return providersHints
}
// Allocate returns the resource assignments collected from the different providers
func Allocate(container *v1.Container, bestHit *TopologyHint,
topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets, hintProviders []HintProvider) map[string]cpuset.CPUSet {
allResAlloc := make(map[string]cpuset.CPUSet)
for _, provider := range hintProviders {
resAlloc := provider.Allocate(container, bestHit, topoInfo, resNumaSets)
for resName, assign := range resAlloc {
allResAlloc[resName] = assign
}
}
return allResAlloc
}
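// Hedged sketch of the intended call sequence (hypothetical caller, not part of
// this package's API): collect hints from all providers, let the node's policy
// pick the best one, and only ask the providers to allocate when the policy
// admits the pod.
func exampleTopologyFlow(node *api.NodeInfo, container *v1.Container,
topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets,
providers []HintProvider, numaNodes []int) map[string]cpuset.CPUSet {
pol := GetPolicy(node, numaNodes)
providersHints := AccumulateProvidersHints(container, topoInfo, resNumaSets, providers)
bestHint, admit := pol.Predicate(providersHints)
if !admit {
return nil
}
return Allocate(container, &bestHint, topoInfo, resNumaSets, providers)
}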
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package policy
import (
"k8s.io/klog"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)
func filterProvidersHints(providersHints []map[string][]TopologyHint) [][]TopologyHint {
var allProviderHints [][]TopologyHint
for _, hints := range providersHints {
// If hints is nil, insert a single, preferred any-numa hint into allProviderHints.
if len(hints) == 0 {
klog.Infof("[numatopo] Hint Provider has no preference for NUMA affinity with any resource")
allProviderHints = append(allProviderHints, []TopologyHint{{nil, true}})
continue
}
// Otherwise, accumulate the hints for each resource type into allProviderHints.
for resource := range hints {
if hints[resource] == nil {
klog.Infof("[numatopo] Hint Provider has no preference for NUMA affinity with resource '%s'", resource)
allProviderHints = append(allProviderHints, []TopologyHint{{nil, true}})
continue
}
if len(hints[resource]) == 0 {
klog.Infof("[numatopo] Hint Provider has no possible NUMA affinities for resource '%s'", resource)
allProviderHints = append(allProviderHints, []TopologyHint{{nil, false}})
continue
}
allProviderHints = append(allProviderHints, hints[resource])
}
}
return allProviderHints
}
func mergeFilteredHints(numaNodes []int, filteredHints [][]TopologyHint) TopologyHint {
// Set the default affinity as an any-numa affinity containing the list
// of NUMA Nodes available on this machine.
defaultAffinity, _ := bitmask.NewBitMask(numaNodes...)
// Set the bestHint to return from this function as {nil false}.
// This will only be returned if no better hint can be found when
// merging hints from each hint provider.
bestHint := TopologyHint{defaultAffinity, false}
iterateAllProviderTopologyHints(filteredHints, func(permutation []TopologyHint) {
// Get the NUMANodeAffinity from each hint in the permutation and see if any
// of them encode unpreferred allocations.
mergedHint := mergePermutation(numaNodes, permutation)
// Only consider mergedHints that result in a NUMANodeAffinity > 0 to
// replace the current bestHint.
if mergedHint.NUMANodeAffinity.Count() == 0 {
return
}
// If the current bestHint is non-preferred and the new mergedHint is
// preferred, always choose the preferred hint over the non-preferred one.
if mergedHint.Preferred && !bestHint.Preferred {
bestHint = mergedHint
return
}
// If the current bestHint is preferred and the new mergedHint is
// non-preferred, never update bestHint, regardless of mergedHint's
// narrowness.
if !mergedHint.Preferred && bestHint.Preferred {
return
}
// If mergedHint and bestHint have the same preference, only consider
// mergedHints that have a narrower NUMANodeAffinity than the
// NUMANodeAffinity in the current bestHint.
if !mergedHint.NUMANodeAffinity.IsNarrowerThan(bestHint.NUMANodeAffinity) {
return
}
// In all other cases, update bestHint to the current mergedHint
bestHint = mergedHint
})
return bestHint
}
// Iterate over all permutations of hints in 'allProviderHints [][]TopologyHint'.
//
// This procedure is implemented as a recursive function over the set of hints
// in 'allproviderHints[i]'. It applies the function 'callback' to each
// permutation as it is found. It is the equivalent of:
//
// for i := 0; i < len(providerHints[0]); i++
// for j := 0; j < len(providerHints[1]); j++
// for k := 0; k < len(providerHints[2]); k++
// ...
// for z := 0; z < len(providerHints[-1]); z++
// permutation := []TopologyHint{
// providerHints[0][i],
// providerHints[1][j],
// providerHints[2][k],
// ...
// providerHints[-1][z]
// }
// callback(permutation)
func iterateAllProviderTopologyHints(allProviderHints [][]TopologyHint, callback func([]TopologyHint)) {
// Internal helper function to accumulate the permutation before calling the callback.
var iterate func(i int, accum []TopologyHint)
iterate = func(i int, accum []TopologyHint) {
// Base case: we have looped through all providers and have a full permutation.
if i == len(allProviderHints) {
callback(accum)
return
}
// Loop through all hints for provider 'i', and recurse to build the
// permutation of this hint with all hints from the remaining providers.
for j := range allProviderHints[i] {
iterate(i+1, append(accum, allProviderHints[i][j]))
}
}
iterate(0, []TopologyHint{})
}
// Merge a TopologyHints permutation to a single hint by performing a bitwise-AND
// of their affinity masks. The merged hint is preferred only if all hints in the
// permutation are preferred.
func mergePermutation(numaNodes []int, permutation []TopologyHint) TopologyHint {
// Get the NUMANodeAffinity from each hint in the permutation and see if any
// of them encode unpreferred allocations.
preferred := true
defaultAffinity, _ := bitmask.NewBitMask(numaNodes...)
var numaAffinities []bitmask.BitMask
for _, hint := range permutation {
// Only consider hints that have an actual NUMANodeAffinity set.
if hint.NUMANodeAffinity == nil {
numaAffinities = append(numaAffinities, defaultAffinity)
} else {
numaAffinities = append(numaAffinities, hint.NUMANodeAffinity)
}
if !hint.Preferred {
preferred = false
}
}
// Merge the affinities using a bitwise-and operation.
mergedAffinity := bitmask.And(defaultAffinity, numaAffinities...)
// Build a mergedHint from the merged affinity mask, indicating whether a
// preferred allocation was used to generate the affinity mask or not.
return TopologyHint{mergedAffinity, preferred}
}
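// Illustrative sketch (hypothetical masks): merging a permutation whose hints
// carry affinities {0,1} and {0} intersects the masks down to {0}; the merged
// hint stays preferred only because every hint in the permutation is preferred.
func exampleMergePermutation() TopologyHint {
numaNodes := []int{0, 1}
both, _ := bitmask.NewBitMask(0, 1)
first, _ := bitmask.NewBitMask(0)
return mergePermutation(numaNodes, []TopologyHint{
{NUMANodeAffinity: both, Preferred: true},
{NUMANodeAffinity: first, Preferred: true},
})
}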
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package policy
import "k8s.io/klog"
type policyBestEffort struct {
numaNodes []int
}
// NewPolicyBestEffort returns a new best-effort policy
func NewPolicyBestEffort(numaNodes []int) Policy {
return &policyBestEffort{numaNodes: numaNodes}
}
func (p *policyBestEffort) canAdmitPodResult(hint *TopologyHint) bool {
return true
}
func (p *policyBestEffort) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
filteredProvidersHints := filterProvidersHints(providersHints)
bestHint := mergeFilteredHints(p.numaNodes, filteredProvidersHints)
admit := p.canAdmitPodResult(&bestHint)
klog.V(4).Infof("bestHint: %v admit %v\n", bestHint, admit)
return bestHint, admit
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package policy
type policyNone struct {
numaNodes []int
}
// NewPolicyNone returns a new none policy
func NewPolicyNone(numaNodes []int) Policy {
return &policyNone{numaNodes: numaNodes}
}
func (policy *policyNone) canAdmitPodResult(hint *TopologyHint) bool {
return true
}
func (policy *policyNone) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
return TopologyHint{}, policy.canAdmitPodResult(nil)
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package policy
import "k8s.io/klog"
type policyRestricted struct {
numaNodes []int
}
// NewPolicyRestricted returns a new restricted policy
func NewPolicyRestricted(numaNodes []int) Policy {
return &policyRestricted{numaNodes: numaNodes}
}
func (p *policyRestricted) canAdmitPodResult(hint *TopologyHint) bool {
return hint.Preferred
}
func (p *policyRestricted) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
filteredHints := filterProvidersHints(providersHints)
bestHint := mergeFilteredHints(p.numaNodes, filteredHints)
admit := p.canAdmitPodResult(&bestHint)
klog.V(4).Infof("bestHint: %v admit %v\n", bestHint, admit)
return bestHint, admit
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package policy
import "k8s.io/klog"
type policySingleNumaNode struct {
numaNodes []int
}
// NewPolicySingleNumaNode returns a new single-numa-node policy
func NewPolicySingleNumaNode(numaNodes []int) Policy {
return &policySingleNumaNode{numaNodes: numaNodes}
}
func (policy *policySingleNumaNode) canAdmitPodResult(hint *TopologyHint) bool {
return hint.Preferred
}
// Return hints that have valid bitmasks with exactly one bit set.
func filterSingleNumaHints(allResourcesHints [][]TopologyHint) [][]TopologyHint {
var filteredResourcesHints [][]TopologyHint
for _, oneResourceHints := range allResourcesHints {
var filtered []TopologyHint
for _, hint := range oneResourceHints {
if hint.NUMANodeAffinity == nil && hint.Preferred {
filtered = append(filtered, hint)
}
if hint.NUMANodeAffinity != nil && hint.NUMANodeAffinity.Count() == 1 && hint.Preferred {
filtered = append(filtered, hint)
}
}
filteredResourcesHints = append(filteredResourcesHints, filtered)
}
return filteredResourcesHints
}
func (policy *policySingleNumaNode) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
filteredHints := filterProvidersHints(providersHints)
singleNumaHints := filterSingleNumaHints(filteredHints)
bestHint := mergeFilteredHints(policy.numaNodes, singleNumaHints)
klog.V(4).Infof("bestHint: %v\n", bestHint)
admit := policy.canAdmitPodResult(&bestHint)
return bestHint, admit
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"sort"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
type cpuAccumulator struct {
topo *topology.CPUTopology
details topology.CPUDetails
numCPUsNeeded int
result cpuset.CPUSet
}
func newCPUAccumulator(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) *cpuAccumulator {
return &cpuAccumulator{
topo: topo,
details: topo.CPUDetails.KeepOnly(availableCPUs),
numCPUsNeeded: numCPUs,
result: cpuset.NewCPUSet(),
}
}
func (a *cpuAccumulator) take(cpus cpuset.CPUSet) {
a.result = a.result.Union(cpus)
a.details = a.details.KeepOnly(a.details.CPUs().Difference(a.result))
a.numCPUsNeeded -= cpus.Size()
}
// isSocketFree returns true if the supplied socket is fully available in a.details.
func (a *cpuAccumulator) isSocketFree(socketID int) bool {
return a.details.CPUsInSockets(socketID).Size() == a.topo.CPUsPerSocket()
}
// isCoreFree returns true if the supplied core is fully available in a.details.
func (a *cpuAccumulator) isCoreFree(coreID int) bool {
return a.details.CPUsInCores(coreID).Size() == a.topo.CPUsPerCore()
}
// freeSockets Returns free socket IDs as a slice sorted by:
// - socket ID, ascending.
func (a *cpuAccumulator) freeSockets() []int {
return a.details.Sockets().Filter(a.isSocketFree).ToSlice()
}
// freeCores Returns core IDs as a slice sorted by:
// - the number of whole available cores on the socket, ascending
// - socket ID, ascending
// - core ID, ascending
func (a *cpuAccumulator) freeCores() []int {
socketIDs := a.details.Sockets().ToSliceNoSort()
sort.Slice(socketIDs,
func(i, j int) bool {
iCores := a.details.CoresInSockets(socketIDs[i]).Filter(a.isCoreFree)
jCores := a.details.CoresInSockets(socketIDs[j]).Filter(a.isCoreFree)
return iCores.Size() < jCores.Size() || socketIDs[i] < socketIDs[j]
})
coreIDs := []int{}
for _, s := range socketIDs {
coreIDs = append(coreIDs, a.details.CoresInSockets(s).Filter(a.isCoreFree).ToSlice()...)
}
return coreIDs
}
// freeCPUs Returns CPU IDs as a slice sorted by:
// - socket affinity with result
// - number of CPUs available on the same socket
// - number of CPUs available on the same core
// - socket ID.
// - core ID.
func (a *cpuAccumulator) freeCPUs() []int {
result := []int{}
cores := a.details.Cores().ToSlice()
sort.Slice(
cores,
func(i, j int) bool {
iCore := cores[i]
jCore := cores[j]
iCPUs := a.topo.CPUDetails.CPUsInCores(iCore).ToSlice()
jCPUs := a.topo.CPUDetails.CPUsInCores(jCore).ToSlice()
iSocket := a.topo.CPUDetails[iCPUs[0]].SocketID
jSocket := a.topo.CPUDetails[jCPUs[0]].SocketID
// Compute the number of CPUs in the result that reside on the same socket
// as each core.
iSocketColoScore := a.topo.CPUDetails.CPUsInSockets(iSocket).Intersection(a.result).Size()
jSocketColoScore := a.topo.CPUDetails.CPUsInSockets(jSocket).Intersection(a.result).Size()
// Compute the number of CPUs available on the same socket
// as each core.
iSocketFreeScore := a.details.CPUsInSockets(iSocket).Size()
jSocketFreeScore := a.details.CPUsInSockets(jSocket).Size()
// Compute the number of available CPUs on each core.
iCoreFreeScore := a.details.CPUsInCores(iCore).Size()
jCoreFreeScore := a.details.CPUsInCores(jCore).Size()
return iSocketColoScore > jSocketColoScore ||
iSocketFreeScore < jSocketFreeScore ||
iCoreFreeScore < jCoreFreeScore ||
iSocket < jSocket ||
iCore < jCore
})
// For each core, append sorted CPU IDs to result.
for _, core := range cores {
result = append(result, a.details.CPUsInCores(core).ToSlice()...)
}
return result
}
func (a *cpuAccumulator) needs(n int) bool {
return a.numCPUsNeeded >= n
}
func (a *cpuAccumulator) isSatisfied() bool {
return a.numCPUsNeeded < 1
}
func (a *cpuAccumulator) isFailed() bool {
return a.numCPUsNeeded > a.details.CPUs().Size()
}
// takeByTopology returns the assigned cpuset
func takeByTopology(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
if acc.isSatisfied() {
return acc.result, nil
}
if acc.isFailed() {
return cpuset.NewCPUSet(), fmt.Errorf("not enough cpus available to satisfy request")
}
// Algorithm: topology-aware best-fit
// 1. Acquire whole sockets, if available and the container requires at
// least a socket's-worth of CPUs.
if acc.needs(acc.topo.CPUsPerSocket()) {
for _, s := range acc.freeSockets() {
klog.V(4).Infof("[cpumanager] takeByTopology: claiming socket [%d]", s)
acc.take(acc.details.CPUsInSockets(s))
if acc.isSatisfied() {
return acc.result, nil
}
if !acc.needs(acc.topo.CPUsPerSocket()) {
break
}
}
}
// 2. Acquire whole cores, if available and the container requires at least
// a core's-worth of CPUs.
if acc.needs(acc.topo.CPUsPerCore()) {
for _, c := range acc.freeCores() {
klog.V(4).Infof("[cpumanager] takeByTopology: claiming core [%d]", c)
acc.take(acc.details.CPUsInCores(c))
if acc.isSatisfied() {
return acc.result, nil
}
if !acc.needs(acc.topo.CPUsPerCore()) {
break
}
}
}
// 3. Acquire single threads, preferring to fill partially-allocated cores
// on the same sockets as the whole cores we have already taken in this
// allocation.
for _, c := range acc.freeCPUs() {
klog.V(4).Infof("[cpumanager] takeByTopology: claiming CPU [%d]", c)
if acc.needs(1) {
acc.take(cpuset.NewCPUSet(c))
}
if acc.isSatisfied() {
return acc.result, nil
}
}
return cpuset.NewCPUSet(), fmt.Errorf("failed to allocate cpus")
}
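// Hedged sketch (hypothetical two-core, hyperthreaded topology, not part of the
// scheduler): CPUs 0 and 2 share core 0 and CPUs 1 and 3 share core 1. Asking
// takeByTopology for two CPUs makes it claim one whole free core instead of
// scattering single threads across cores.
func exampleTakeByTopology() (cpuset.CPUSet, error) {
topo := &topology.CPUTopology{
NumCPUs:    4,
NumCores:   2,
NumSockets: 1,
CPUDetails: topology.CPUDetails{
0: {NUMANodeID: 0, SocketID: 0, CoreID: 0},
1: {NUMANodeID: 0, SocketID: 0, CoreID: 1},
2: {NUMANodeID: 0, SocketID: 0, CoreID: 0},
3: {NUMANodeID: 0, SocketID: 0, CoreID: 1},
},
}
return takeByTopology(topo, topo.CPUDetails.CPUs(), 2)
}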
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"math"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/plugins/numaaware/policy"
)
type cpuMng struct {
}
// NewProvider returns a new provider
func NewProvider() policy.HintProvider {
return &cpuMng{}
}
// Name returns the cpu manager name
func (mng *cpuMng) Name() string {
return "cpuMng"
}
// guaranteedCPUs returns the integer number of exclusive CPUs requested by the container (0 if the request is fractional)
func guaranteedCPUs(container *v1.Container) int {
cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
return 0
}
return int(cpuQuantity.Value())
}
// generateCPUTopologyHints returns the NUMA topology hints based on
// - availableCPUs
func generateCPUTopologyHints(availableCPUs cpuset.CPUSet, CPUDetails topology.CPUDetails, request int) []policy.TopologyHint {
minAffinitySize := CPUDetails.NUMANodes().Size()
hints := []policy.TopologyHint{}
bitmask.IterateBitMasks(CPUDetails.NUMANodes().ToSlice(), func(mask bitmask.BitMask) {
// First, update minAffinitySize for the current request size.
cpusInMask := CPUDetails.CPUsInNUMANodes(mask.GetBits()...).Size()
if cpusInMask >= request && mask.Count() < minAffinitySize {
minAffinitySize = mask.Count()
}
// Then count how many of the available CPUs fall on the NUMA nodes selected
// by the current bitmask, to check whether they can satisfy the CPU request.
numMatching := 0
for _, c := range availableCPUs.ToSlice() {
if mask.IsSet(CPUDetails[c].NUMANodeID) {
numMatching++
}
}
// If they don't, then move onto the next combination.
if numMatching < request {
return
}
// Otherwise, create a new hint from the numa node bitmask and add it to the
// list of hints. We set all hint preferences to 'false' on the first
// pass through.
hints = append(hints, policy.TopologyHint{
NUMANodeAffinity: mask,
Preferred: false,
})
})
// Loop back through all hints and update the 'Preferred' field based on
// counting the number of bits sets in the affinity mask and comparing it
// to the minAffinitySize. Only those with an equal number of bits set (and
// with a minimal set of numa nodes) will be considered preferred.
for i := range hints {
if hints[i].NUMANodeAffinity.Count() == minAffinitySize {
hints[i].Preferred = true
}
}
return hints
}
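// Hedged sketch (hypothetical two-NUMA-node box): with two CPUs per NUMA node
// all available and a request for two CPUs, generateCPUTopologyHints marks the
// single-node masks {0} and {1} as preferred and the wider mask {0,1} as not
// preferred, because the minimal affinity size is a single node.
func exampleGenerateCPUTopologyHints() []policy.TopologyHint {
details := topology.CPUDetails{
0: {NUMANodeID: 0, SocketID: 0, CoreID: 0},
1: {NUMANodeID: 0, SocketID: 0, CoreID: 1},
2: {NUMANodeID: 1, SocketID: 1, CoreID: 2},
3: {NUMANodeID: 1, SocketID: 1, CoreID: 3},
}
return generateCPUTopologyHints(details.CPUs(), details, 2)
}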
func (mng *cpuMng) GetTopologyHints(container *v1.Container,
topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string][]policy.TopologyHint {
if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
klog.Warningf("container %s has no cpu request", container.Name)
return nil
}
requestNum := guaranteedCPUs(container)
if requestNum == 0 {
klog.Warningf(" the cpu request isn't integer in container %s", container.Name)
return nil
}
cputopo := &topology.CPUTopology{
NumCPUs: topoInfo.CPUDetail.CPUs().Size(),
NumCores: topoInfo.CPUDetail.Cores().Size() * topoInfo.CPUDetail.Sockets().Size(),
NumSockets: topoInfo.CPUDetail.Sockets().Size(),
CPUDetails: topoInfo.CPUDetail,
}
reserved := cpuset.NewCPUSet()
reservedCPUs, ok := topoInfo.ResReserved[v1.ResourceCPU]
if ok {
// Take the ceiling of the reservation, since fractional CPUs cannot be
// exclusively allocated.
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
reserved, _ = takeByTopology(cputopo, cputopo.CPUDetails.CPUs(), numReservedCPUs)
klog.V(4).Infof("[cpumanager] reserve cpuset :%v", reserved)
}
availableCPUSet, ok := resNumaSets[string(v1.ResourceCPU)]
if !ok {
klog.Warningf("no cpu resource")
return nil
}
availableCPUSet = availableCPUSet.Difference(reserved)
klog.V(4).Infof("requested: %d, availableCPUSet: %v", requestNum, availableCPUSet)
return map[string][]policy.TopologyHint{
string(v1.ResourceCPU): generateCPUTopologyHints(availableCPUSet, topoInfo.CPUDetail, requestNum),
}
}
func (mng *cpuMng) Allocate(container *v1.Container, bestHit *policy.TopologyHint,
topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string]cpuset.CPUSet {
cputopo := &topology.CPUTopology{
NumCPUs: topoInfo.CPUDetail.CPUs().Size(),
NumCores: topoInfo.CPUDetail.Cores().Size() * topoInfo.CPUDetail.Sockets().Size(),
NumSockets: topoInfo.CPUDetail.Sockets().Size(),
CPUDetails: topoInfo.CPUDetail,
}
reserved := cpuset.NewCPUSet()
reservedCPUs, ok := topoInfo.ResReserved[v1.ResourceCPU]
if ok {
// Take the ceiling of the reservation, since fractional CPUs cannot be
// exclusively allocated.
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
reserved, _ = takeByTopology(cputopo, cputopo.CPUDetails.CPUs(), numReservedCPUs)
klog.V(3).Infof("[cpumanager] reserve cpuset :%v", reserved)
}
requestNum := guaranteedCPUs(container)
availableCPUSet := resNumaSets[string(v1.ResourceCPU)]
availableCPUSet = availableCPUSet.Difference(reserved)
klog.V(4).Infof("alignedCPUs: %v requestNum: %v bestHit %v", availableCPUSet, requestNum, bestHit)
result := cpuset.NewCPUSet()
if bestHit.NUMANodeAffinity != nil {
alignedCPUs := cpuset.NewCPUSet()
for _, numaNodeID := range bestHit.NUMANodeAffinity.GetBits() {
alignedCPUs = alignedCPUs.Union(availableCPUSet.Intersection(cputopo.CPUDetails.CPUsInNUMANodes(numaNodeID)))
}
numAlignedToAlloc := alignedCPUs.Size()
if requestNum < numAlignedToAlloc {
numAlignedToAlloc = requestNum
}
alignedCPUs, err := takeByTopology(cputopo, alignedCPUs, numAlignedToAlloc)
if err != nil {
return map[string]cpuset.CPUSet{
string(v1.ResourceCPU): cpuset.NewCPUSet(),
}
}
result = result.Union(alignedCPUs)
}
// Get any remaining CPUs from what's leftover after attempting to grab aligned ones.
remainingCPUs, err := takeByTopology(cputopo, availableCPUSet.Difference(result), requestNum-result.Size())
if err != nil {
return map[string]cpuset.CPUSet{
string(v1.ResourceCPU): cpuset.NewCPUSet(),
}
}
result = result.Union(remainingCPUs)
return map[string]cpuset.CPUSet{
string(v1.ResourceCPU): result,
}
}
/*
Copyright 2020 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package predicates
import (
"fmt"
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
)
type predicateCache struct {
sync.RWMutex
cache map[string]map[string]bool //key_1: node name, key_2: pod template uid
}
// predicateCacheNew returns a new predicate cache
func predicateCacheNew() *predicateCache {
return &predicateCache{
cache: make(map[string]map[string]bool),
}
}
// getPodTemplateUID returns the pod template UID from the pod annotations
func getPodTemplateUID(pod *v1.Pod) string {
uid, found := pod.Annotations[batch.PodTemplateKey]
if !found {
return ""
}
return uid
}
// PredicateWithCache checks whether a predicate result for the node and pod template exists in the cache
func (pc *predicateCache) PredicateWithCache(nodeName string, pod *v1.Pod) (bool, error) {
podTemplateUID := getPodTemplateUID(pod)
if podTemplateUID == "" {
return false, fmt.Errorf("no anonation of volcano.sh/template-uid in pod %s", pod.Name)
}
pc.RLock()
defer pc.RUnlock()
if nodeCache, exist := pc.cache[nodeName]; exist {
if result, exist := nodeCache[podTemplateUID]; exist {
klog.V(4).Infof("Predicate node %s and pod %s result %v", nodeName, pod.Name, result)
return result, nil
}
}
return false, fmt.Errorf("no information of node %s and pod %s in predicate cache", nodeName, pod.Name)
}
// UpdateCache updates the cache data
func (pc *predicateCache) UpdateCache(nodeName string, pod *v1.Pod, fit bool) {
podTemplateUID := getPodTemplateUID(pod)
if podTemplateUID == "" {
klog.V(3).Infof("Don't find pod %s template uid", pod.Name)
return
}
pc.Lock()
defer pc.Unlock()
if _, exist := pc.cache[nodeName]; !exist {
podCache := make(map[string]bool)
podCache[podTemplateUID] = fit
pc.cache[nodeName] = podCache
} else {
pc.cache[nodeName][podTemplateUID] = fit
}
}
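// Illustrative sketch of the intended usage: try the cached result first and fall back
// to running the real predicates, then record the outcome so later pods stamped from the
// same template can reuse it (runRealPredicates is a hypothetical stand-in for the
// caller's filter chain):
//
//	fit, err := pCache.PredicateWithCache(nodeName, pod)
//	if err != nil {
//		fit, err = runRealPredicates(pod, nodeName)
//		pCache.UpdateCache(nodeName, pod, fit)
//	}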
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package predicates
import (
"fmt"
v1 "k8s.io/api/core/v1"
"volcano.sh/volcano/pkg/scheduler/api"
)
// checkNodeGPUSharingPredicate checks if a gpu sharing pod can be scheduled on a node.
func checkNodeGPUSharingPredicate(pod *v1.Pod, nodeInfo *api.NodeInfo) (bool, error) {
// no gpu sharing request
if api.GetGPUResourceOfPod(pod) <= 0 {
return true, nil
}
id := predicateGPU(pod, nodeInfo)
if id < 0 {
return false, fmt.Errorf("no enough gpu memory on single device of node %s", nodeInfo.Name)
}
return true, nil
}
// predicateGPU returns the ID of the first GPU device with enough idle memory for the pod, or -1 if none fits.
func predicateGPU(pod *v1.Pod, node *api.NodeInfo) int {
gpuRequest := api.GetGPUResourceOfPod(pod)
allocatableGPUs := node.GetDevicesIdleGPUMemory()
for devID := 0; devID < len(allocatableGPUs); devID++ {
availableGPU, ok := allocatableGPUs[devID]
if ok {
if availableGPU >= gpuRequest {
return devID
}
}
}
return -1
}
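// Illustrative sketch: assuming GetDevicesIdleGPUMemory reports {0: 2048, 1: 8192} and
// the pod requests 4096, predicateGPU scans devices in index order and returns 1, the
// first device whose idle memory covers the request; with no such device it returns -1
// and checkNodeGPUSharingPredicate rejects the node:
//
//	if id := predicateGPU(pod, nodeInfo); id < 0 {
//		// no single device has enough idle GPU memory for this pod
//	}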
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package predicates
import (
"context"
"fmt"
"strings"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/framework"
"volcano.sh/volcano/pkg/scheduler/plugins/util"
"volcano.sh/volcano/pkg/scheduler/plugins/util/k8s"
)
const (
// PluginName indicates name of volcano scheduler plugin.
PluginName = "predicates"
// GPUSharingPredicate is the key for enabling GPU Sharing Predicate in YAML
GPUSharingPredicate = "predicate.GPUSharingEnable"
// CachePredicate control cache predicate feature
CachePredicate = "predicate.CacheEnable"
// ProportionalPredicate is the key for enabling Proportional Predicate in YAML
ProportionalPredicate = "predicate.ProportionalEnable"
// ProportionalResource is the key for additional resource key name
ProportionalResource = "predicate.resources"
// ProportionalResourcesPrefix is the key prefix for additional resource key name
ProportionalResourcesPrefix = ProportionalResource + "."
)
type predicatesPlugin struct {
// Arguments given for the plugin
pluginArguments framework.Arguments
}
// New returns a predicates plugin instance.
func New(arguments framework.Arguments) framework.Plugin {
return &predicatesPlugin{pluginArguments: arguments}
}
func (pp *predicatesPlugin) Name() string {
return PluginName
}
type baseResource struct {
CPU float64
Memory float64
}
type predicateEnable struct {
gpuSharingEnable bool
cacheEnable bool
proportionalEnable bool
proportional map[v1.ResourceName]baseResource
}
func enablePredicate(args framework.Arguments) predicateEnable {
/*
Users should enable predicates via arguments of the form predicate.<Name>Enable (for example predicate.GPUSharingEnable).
The supported switches are GPU sharing, predicate caching and proportional resource checks, configured as below:
actions: "reclaim, allocate, backfill, preempt"
tiers:
- plugins:
- name: priority
- name: gang
- name: conformance
- plugins:
- name: drf
- name: predicates
arguments:
predicate.GPUSharingEnable: true
predicate.CacheEnable: true
predicate.ProportionalEnable: true
predicate.resources: nvidia.com/gpu
predicate.resources.nvidia.com/gpu.cpu: 4
predicate.resources.nvidia.com/gpu.memory: 8
- name: proportion
- name: nodeorder
*/
predicate := predicateEnable{
gpuSharingEnable: false,
cacheEnable: false,
proportionalEnable: false,
}
// If predicate.GPUSharingEnable is provided, override the default value in the predicateEnable struct.
args.GetBool(&predicate.gpuSharingEnable, GPUSharingPredicate)
args.GetBool(&predicate.cacheEnable, CachePredicate)
// If predicate.ProportionalEnable is provided, override the default value in the predicateEnable struct.
args.GetBool(&predicate.proportionalEnable, ProportionalPredicate)
resourcesProportional := make(map[v1.ResourceName]baseResource)
resourcesStr := args[ProportionalResource]
resources := strings.Split(resourcesStr, ",")
for _, resource := range resources {
resource = strings.TrimSpace(resource)
if resource == "" {
continue
}
// predicate.resources.[ResourceName].cpu / .memory
cpuResourceKey := ProportionalResourcesPrefix + resource + ".cpu"
cpuResourceRate := 1.0
args.GetFloat64(&cpuResourceRate, cpuResourceKey)
if cpuResourceRate < 0 {
cpuResourceRate = 1.0
}
memoryResourceKey := ProportionalResourcesPrefix + resource + ".memory"
memoryResourceRate := 1.0
args.GetFloat64(&memoryResourceRate, memoryResourceKey)
if memoryResourceRate < 0 {
memoryResourceRate = 1.0
}
r := baseResource{
CPU: cpuResourceRate,
Memory: memoryResourceRate,
}
resourcesProportional[v1.ResourceName(resource)] = r
}
predicate.proportional = resourcesProportional
return predicate
}
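// Illustrative sketch: with the arguments shown in the comment above, enablePredicate
// produces roughly the following (keys that are absent keep their defaults):
//
//	predicateEnable{
//		gpuSharingEnable:   true,
//		cacheEnable:        true,
//		proportionalEnable: true,
//		proportional: map[v1.ResourceName]baseResource{
//			"nvidia.com/gpu": {CPU: 4, Memory: 8},
//		},
//	}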
func (pp *predicatesPlugin) OnSessionOpen(ssn *framework.Session) {
pl := util.NewPodListerFromNode(ssn)
nodeMap := util.GenerateNodeMapAndSlice(ssn.Nodes)
pCache := predicateCacheNew()
predicate := enablePredicate(pp.pluginArguments)
kubeClient := ssn.KubeClient()
// Register event handlers to update task info in PodLister & nodeMap
ssn.AddEventHandler(&framework.EventHandler{
AllocateFunc: func(event *framework.Event) {
pod := pl.UpdateTask(event.Task, event.Task.NodeName)
nodeName := event.Task.NodeName
node, found := nodeMap[nodeName]
if !found {
klog.Errorf("predicates, update pod %s/%s allocate to NOT EXIST node [%s]", pod.Namespace, pod.Name, nodeName)
return
}
if predicate.gpuSharingEnable && api.GetGPUResourceOfPod(pod) > 0 {
nodeInfo, ok := ssn.Nodes[nodeName]
if !ok {
klog.Errorf("Failed to get node %s info from cache", nodeName)
return
}
id := predicateGPU(pod, nodeInfo)
if id < 0 {
klog.Errorf("The node %s can't place the pod %s in ns %s", pod.Spec.NodeName, pod.Name, pod.Namespace)
return
}
dev, ok := nodeInfo.GPUDevices[id]
if !ok {
klog.Errorf("Failed to get GPU %d from node %s", id, nodeName)
return
}
patch := api.AddGPUIndexPatch(id)
pod, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
if err != nil {
klog.Errorf("Patch pod %s failed with patch %s: %v", pod.Name, patch, err)
return
}
dev.PodMap[string(pod.UID)] = pod
klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, nodeName)
}
node.AddPod(pod)
klog.V(4).Infof("predicates, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, nodeName)
},
DeallocateFunc: func(event *framework.Event) {
pod := pl.UpdateTask(event.Task, "")
nodeName := event.Task.NodeName
node, found := nodeMap[nodeName]
if !found {
klog.Errorf("predicates, update pod %s/%s allocate from NOT EXIST node [%s]", pod.Namespace, pod.Name, nodeName)
return
}
if predicate.gpuSharingEnable && api.GetGPUResourceOfPod(pod) > 0 {
// deallocate pod gpu id
id := api.GetGPUIndex(pod)
patch := api.RemoveGPUIndexPatch()
_, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
if err != nil {
klog.Errorf("Patch pod %s failed with patch %s: %v", pod.Name, patch, err)
return
}
nodeInfo, ok := ssn.Nodes[nodeName]
if !ok {
klog.Errorf("Failed to get node %s info from cache", nodeName)
return
}
if dev, ok := nodeInfo.GPUDevices[id]; ok {
delete(dev.PodMap, string(pod.UID))
}
klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s deallocate from node [%s]", pod.Namespace, pod.Name, nodeName)
}
err := node.RemovePod(pod)
if err != nil {
klog.Errorf("predicates, remove pod %s/%s from node [%s] error: %v", pod.Namespace, pod.Name, nodeName, err)
return
}
klog.V(4).Infof("predicates, update pod %s/%s deallocate from node [%s]", pod.Namespace, pod.Name, nodeName)
},
})
// Initialize k8s plugins
// TODO: Add more predicates, k8s.io/kubernetes/pkg/scheduler/framework/plugins/legacy_registry.go
handle := k8s.NewFrameworkHandle(nodeMap, ssn.KubeClient(), ssn.InformerFactory())
// 1. NodeUnschedulable
plugin, _ := nodeunschedulable.New(nil, handle)
nodeUnscheduleFilter := plugin.(*nodeunschedulable.NodeUnschedulable)
// 2. NodeAffinity
plugin, _ = nodeaffinity.New(nil, handle)
nodeAffinityFilter := plugin.(*nodeaffinity.NodeAffinity)
// 3. NodePorts
plugin, _ = nodeports.New(nil, handle)
nodePortFilter := plugin.(*nodeports.NodePorts)
// 4. TaintToleration
plugin, _ = tainttoleration.New(nil, handle)
tolerationFilter := plugin.(*tainttoleration.TaintToleration)
// 5. InterPodAffinity
plArgs := &config.InterPodAffinityArgs{}
plugin, _ = interpodaffinity.New(plArgs, handle)
podAffinityFilter := plugin.(*interpodaffinity.InterPodAffinity)
ssn.AddPredicateFn(pp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) error {
nodeInfo, found := nodeMap[node.Name]
if !found {
return fmt.Errorf("failed to predicates, node info for %s not found", node.Name)
}
if node.Allocatable.MaxTaskNum <= len(nodeInfo.Pods) {
klog.V(4).Infof("NodePodNumber predicates Task <%s/%s> on Node <%s> failed",
task.Namespace, task.Name, node.Name)
return api.NewFitError(task, node, api.NodePodNumberExceeded)
}
state := k8sframework.NewCycleState()
predicateByStablefilter := func(pod *v1.Pod, nodeInfo *k8sframework.NodeInfo) (bool, error) {
// CheckNodeUnschedulable
status := nodeUnscheduleFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
if !status.IsSuccess() {
return false, fmt.Errorf("plugin %s predicates failed %s", nodeunschedulable.Name, status.Message())
}
// Check NodeAffinity
status = nodeAffinityFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
if !status.IsSuccess() {
return false, fmt.Errorf("plugin %s predicates failed %s", nodeaffinity.Name, status.Message())
}
// PodToleratesNodeTaints: TaintToleration
status = tolerationFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
if !status.IsSuccess() {
return false, fmt.Errorf("plugin %s predicates failed %s", tainttoleration.Name, status.Message())
}
return true, nil
}
// Check PredicateWithCache
{
var err error
var fit bool
if predicate.cacheEnable {
fit, err = pCache.PredicateWithCache(node.Name, task.Pod)
if err != nil {
fit, err = predicateByStablefilter(task.Pod, nodeInfo)
pCache.UpdateCache(node.Name, task.Pod, fit)
} else {
if !fit {
err = fmt.Errorf("plugin equivalence cache predicates failed")
}
}
} else {
fit, err = predicateByStablefilter(task.Pod, nodeInfo)
}
if !fit {
return err
}
}
// Check NodePorts
nodePortFilter.PreFilter(context.TODO(), state, task.Pod)
status := nodePortFilter.Filter(context.TODO(), state, nil, nodeInfo)
if !status.IsSuccess() {
return fmt.Errorf("plugin %s predicates failed %s", nodeaffinity.Name, status.Message())
}
// InterPodAffinity Predicate
status = podAffinityFilter.PreFilter(context.TODO(), state, task.Pod)
if !status.IsSuccess() {
return fmt.Errorf("plugin %s pre-predicates failed %s", interpodaffinity.Name, status.Message())
}
status = podAffinityFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
if !status.IsSuccess() {
return fmt.Errorf("plugin %s predicates failed %s", interpodaffinity.Name, status.Message())
}
if predicate.gpuSharingEnable {
// CheckGPUSharingPredicate
fit, err := checkNodeGPUSharingPredicate(task.Pod, node)
if err != nil {
return err
}
klog.V(4).Infof("checkNodeGPUSharingPredicate predicates Task <%s/%s> on Node <%s>: fit %v",
task.Namespace, task.Name, node.Name, fit)
}
if predicate.proportionalEnable {
// Check ProportionalPredicate
fit, err := checkNodeResourceIsProportional(task, node, predicate.proportional)
if err != nil {
return err
}
klog.V(4).Infof("checkNodeResourceIsProportional predicates Task <%s/%s> on Node <%s>: fit %v",
task.Namespace, task.Name, node.Name, fit)
}
return nil
})
}
func (pp *predicatesPlugin) OnSessionClose(ssn *framework.Session) {}
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package predicates
import (
"fmt"
v1 "k8s.io/api/core/v1"
"volcano.sh/volcano/pkg/scheduler/api"
)
// checkNodeResourceIsProportional checks whether enough CPU and memory would remain idle on the node, proportional to its idle amount of each configured resource (e.g. GPU), after placing the task.
func checkNodeResourceIsProportional(task *api.TaskInfo, node *api.NodeInfo, proportional map[v1.ResourceName]baseResource) (bool, error) {
for resourceName := range proportional {
if value, found := task.Resreq.ScalarResources[resourceName]; found && value > 0 {
return true, nil
}
}
for resourceName, resourceRate := range proportional {
if value, found := node.Idle.ScalarResources[resourceName]; found {
cpuReserved := value * resourceRate.CPU
memoryReserved := value * resourceRate.Memory * 1000 * 1000
r := node.Idle.Clone()
r = r.Sub(task.Resreq)
if r.MilliCPU < cpuReserved || r.Memory < memoryReserved {
return false, fmt.Errorf("proportional of resource %s check failed", resourceName)
}
}
}
return true, nil
}
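// Illustrative sketch, assuming scalar resources are tracked in milli-units as elsewhere
// in the scheduler API: with predicate.resources.nvidia.com/gpu.cpu: 4 and
// predicate.resources.nvidia.com/gpu.memory: 8, a node with 2 idle GPUs (scalar value
// 2000) must keep at least cpuReserved = 2000*4 = 8000 milli-CPU and
// memoryReserved = 2000*8*1000*1000 bytes (roughly 16GB) idle after subtracting the
// task's request, otherwise the proportional predicate rejects the node.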
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tasktopology
import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/api"
)
type reqAction int
const (
reqSub reqAction = iota
reqAdd
)
// Bucket is the struct used to group tasks by affinity and anti-affinity
type Bucket struct {
index int
tasks map[types.UID]*api.TaskInfo
taskNameSet map[string]int
// reqScore is the aggregate resource score of the bucket.
// Currently 1 CPU, 1 GPU and 1Gi of memory are treated as roughly the same score.
reqScore float64
request *api.Resource
boundTask int
node map[string]int
}
// NewBucket creates a new empty bucket
func NewBucket() *Bucket {
return &Bucket{
index: 0,
tasks: make(map[types.UID]*api.TaskInfo),
taskNameSet: make(map[string]int),
reqScore: 0,
request: api.EmptyResource(),
boundTask: 0,
node: make(map[string]int),
}
}
// CalcResReq adds or subtracts a task's resource request from the bucket's score and aggregate request
func (b *Bucket) CalcResReq(req *api.Resource, action reqAction) {
if req == nil {
return
}
cpu := req.MilliCPU
// treat 1Mi of memory the same as 1m of CPU and 1m of GPU
mem := req.Memory / 1024 / 1024
score := cpu + mem
for _, request := range req.ScalarResources {
score += request
}
switch action {
case reqSub:
b.reqScore -= score
b.request.Sub(req)
case reqAdd:
b.reqScore += score
b.request.Add(req)
default:
klog.V(3).Infof("Invalid action <%v> for resource <%v>", action, req)
}
}
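// Illustrative sketch of the scoring above: a request of 2 CPUs (2000m) and 1Gi of
// memory (1024Mi) contributes 2000 + 1024 = 3024 to reqScore on reqAdd (plus the raw
// value of any scalar resources), and the same amount is subtracted again on reqSub,
// keeping reqScore in step with the bucket's aggregate request.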
// AddTask adds a task into the bucket; already bound tasks only update the node and bound-task counters
func (b *Bucket) AddTask(taskName string, task *api.TaskInfo) {
b.taskNameSet[taskName]++
if task.NodeName != "" {
b.node[task.NodeName]++
b.boundTask++
return
}
b.tasks[task.Pod.UID] = task
b.CalcResReq(task.Resreq, reqAdd)
}
// TaskBound marks a task in the bucket as bound to a node and removes it from the pending set
func (b *Bucket) TaskBound(task *api.TaskInfo) {
b.node[task.NodeName]++
b.boundTask++
delete(b.tasks, task.Pod.UID)
b.CalcResReq(task.Resreq, reqSub)
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tasktopology
import (
"fmt"
"math"
"sort"
"strings"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/api"
)
type topologyType int
const (
selfAntiAffinity topologyType = iota
interAntiAffinity
selfAffinity
interAffinity
)
// map[topologyType]priority, the larger number means the higher priority
var affinityPriority = map[topologyType]int{
selfAntiAffinity: 4,
interAffinity: 3,
selfAffinity: 2,
interAntiAffinity: 1,
}
// JobManager is the struct used to track the affinity settings and buckets of a job
type JobManager struct {
jobID api.JobID
buckets []*Bucket
podInBucket map[types.UID]int
podInTask map[types.UID]string
taskOverPod map[string]map[types.UID]struct{}
taskAffinityPriority map[string]int // [taskName] -> priority
taskExistOrder map[string]int
interAffinity map[string]map[string]struct{} // [taskName]->[taskName]
selfAffinity map[string]struct{}
interAntiAffinity map[string]map[string]struct{} // [taskName]->[taskName]
selfAntiAffinity map[string]struct{}
bucketMaxSize int
nodeTaskSet map[string]map[string]int // [nodeName]->[taskName]
}
// NewJobManager creates a new job manager for job
func NewJobManager(jobID api.JobID) *JobManager {
return &JobManager{
jobID: jobID,
buckets: make([]*Bucket, 0),
podInBucket: make(map[types.UID]int),
podInTask: make(map[types.UID]string),
taskOverPod: make(map[string]map[types.UID]struct{}),
taskAffinityPriority: make(map[string]int),
taskExistOrder: make(map[string]int),
interAffinity: make(map[string]map[string]struct{}),
interAntiAffinity: make(map[string]map[string]struct{}),
selfAffinity: make(map[string]struct{}),
selfAntiAffinity: make(map[string]struct{}),
bucketMaxSize: 0,
nodeTaskSet: make(map[string]map[string]int),
}
}
// MarkOutOfBucket indicates task is outside of any bucket
func (jm *JobManager) MarkOutOfBucket(uid types.UID) {
jm.podInBucket[uid] = OutOfBucket
}
// MarkTaskHasTopology records that the task has topology settings, keeping the highest priority seen
func (jm *JobManager) MarkTaskHasTopology(taskName string, topoType topologyType) {
priority := affinityPriority[topoType]
if priority > jm.taskAffinityPriority[taskName] {
jm.taskAffinityPriority[taskName] = priority
}
}
// ApplyTaskTopology transforms taskTopology to matrix
// affinity: [[a, b], [c]]
// interAffinity:
// a b c
// a - x -
// b x - -
// c - - -
// selfAffinity:
// a b c
// - - x
func (jm *JobManager) ApplyTaskTopology(topo *TaskTopology) {
for _, aff := range topo.Affinity {
if len(aff) == 1 {
taskName := aff[0]
jm.selfAffinity[taskName] = struct{}{}
jm.MarkTaskHasTopology(taskName, selfAffinity)
continue
}
for index, src := range aff {
for _, dst := range aff[:index] {
addAffinity(jm.interAffinity, src, dst)
addAffinity(jm.interAffinity, dst, src)
}
jm.MarkTaskHasTopology(src, interAffinity)
}
}
for _, aff := range topo.AntiAffinity {
if len(aff) == 1 {
taskName := aff[0]
jm.selfAntiAffinity[taskName] = struct{}{}
jm.MarkTaskHasTopology(taskName, selfAntiAffinity)
continue
}
for index, src := range aff {
for _, dst := range aff[:index] {
addAffinity(jm.interAntiAffinity, src, dst)
addAffinity(jm.interAntiAffinity, dst, src)
}
jm.MarkTaskHasTopology(src, interAntiAffinity)
}
}
length := len(topo.TaskOrder)
for index, taskName := range topo.TaskOrder {
jm.taskExistOrder[taskName] = length - index
}
}
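// Illustrative sketch: applying the example from the comment above,
// topo.Affinity = [][]string{{"a", "b"}, {"c"}} yields
//
//	interAffinity: {"a": {"b": {}}, "b": {"a": {}}}
//	selfAffinity:  {"c": {}}
//
// with "a" and "b" marked at interAffinity priority and "c" at selfAffinity priority
// via MarkTaskHasTopology.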
// NewBucket creates a new bucket
func (jm *JobManager) NewBucket() *Bucket {
bucket := NewBucket()
bucket.index = len(jm.buckets)
jm.buckets = append(jm.buckets, bucket)
return bucket
}
// AddTaskToBucket adds task into bucket
func (jm *JobManager) AddTaskToBucket(bucketIndex int, taskName string, task *api.TaskInfo) {
bucket := jm.buckets[bucketIndex]
jm.podInBucket[task.Pod.UID] = bucketIndex
bucket.AddTask(taskName, task)
if size := len(bucket.tasks) + bucket.boundTask; size > jm.bucketMaxSize {
jm.bucketMaxSize = size
}
}
// L compared with R, -1 for L < R, 0 for L == R, 1 for L > R
func (jm *JobManager) taskAffinityOrder(L, R *api.TaskInfo) int {
LTaskName := jm.podInTask[L.Pod.UID]
RTaskName := jm.podInTask[R.Pod.UID]
// tasks from the same volcano task spec are equal
if LTaskName == RTaskName {
return 0
}
// prefer the user-defined order first
LOrder := jm.taskExistOrder[LTaskName]
ROrder := jm.taskExistOrder[RTaskName]
if LOrder != ROrder {
if LOrder > ROrder {
return 1
}
return -1
}
LPriority := jm.taskAffinityPriority[LTaskName]
RPriority := jm.taskAffinityPriority[RTaskName]
if LPriority != RPriority {
if LPriority > RPriority {
return 1
}
return -1
}
// all affinity settings of L and R are the same, so they are equal
return 0
}
func (jm *JobManager) buildTaskInfo(tasks map[api.TaskID]*api.TaskInfo) []*api.TaskInfo {
taskWithoutBucket := make([]*api.TaskInfo, 0, len(tasks))
for _, task := range tasks {
pod := task.Pod
taskName := getTaskName(task)
if taskName == "" {
jm.MarkOutOfBucket(pod.UID)
continue
}
if _, hasTopology := jm.taskAffinityPriority[taskName]; !hasTopology {
jm.MarkOutOfBucket(pod.UID)
continue
}
jm.podInTask[pod.UID] = taskName
taskSet, ok := jm.taskOverPod[taskName]
if !ok {
taskSet = make(map[types.UID]struct{})
jm.taskOverPod[taskName] = taskSet
}
taskSet[pod.UID] = struct{}{}
taskWithoutBucket = append(taskWithoutBucket, task)
}
return taskWithoutBucket
}
func (jm *JobManager) checkTaskSetAffinity(taskName string, taskNameSet map[string]int, onlyAnti bool) int {
bucketPodAff := 0
if taskName == "" {
return bucketPodAff
}
for taskNameInBucket, count := range taskNameSet {
theSameTask := taskNameInBucket == taskName
if !onlyAnti {
affinity := false
if theSameTask {
_, affinity = jm.selfAffinity[taskName]
} else {
_, affinity = jm.interAffinity[taskName][taskNameInBucket]
}
if affinity {
bucketPodAff += count
}
}
antiAffinity := false
if theSameTask {
_, antiAffinity = jm.selfAntiAffinity[taskName]
} else {
_, antiAffinity = jm.interAntiAffinity[taskName][taskNameInBucket]
}
if antiAffinity {
bucketPodAff -= count
}
}
return bucketPodAff
}
func (jm *JobManager) buildBucket(taskWithOrder []*api.TaskInfo) {
nodeBucketMapping := make(map[string]*Bucket)
for _, task := range taskWithOrder {
klog.V(5).Infof("jobID %s task with order task %s/%s", jm.jobID, task.Namespace, task.Name)
var selectedBucket *Bucket
maxAffinity := math.MinInt32
taskName := getTaskName(task)
if task.NodeName != "" {
// generate bucket by node
maxAffinity = 0
selectedBucket = nodeBucketMapping[task.NodeName]
} else {
for _, bucket := range jm.buckets {
bucketPodAff := jm.checkTaskSetAffinity(taskName, bucket.taskNameSet, false)
// choose the bucket with the best affinity fit, or balance resources between buckets on a tie
if bucketPodAff > maxAffinity {
maxAffinity = bucketPodAff
selectedBucket = bucket
} else if bucketPodAff == maxAffinity && selectedBucket != nil &&
bucket.reqScore < selectedBucket.reqScore {
selectedBucket = bucket
}
}
}
if maxAffinity < 0 || selectedBucket == nil {
selectedBucket = jm.NewBucket()
if task.NodeName != "" {
nodeBucketMapping[task.NodeName] = selectedBucket
}
}
jm.AddTaskToBucket(selectedBucket.index, taskName, task)
}
}
// ConstructBucket builds buckets for the given tasks
func (jm *JobManager) ConstructBucket(tasks map[api.TaskID]*api.TaskInfo) {
taskWithoutBucket := jm.buildTaskInfo(tasks)
o := TaskOrder{
tasks: taskWithoutBucket,
manager: jm,
}
sort.Sort(sort.Reverse(&o))
jm.buildBucket(o.tasks)
}
// TaskBound records that a task has been bound to a node and updates its bucket
func (jm *JobManager) TaskBound(task *api.TaskInfo) {
if taskName := getTaskName(task); taskName != "" {
set, ok := jm.nodeTaskSet[task.NodeName]
if !ok {
set = make(map[string]int)
jm.nodeTaskSet[task.NodeName] = set
}
set[taskName]++
}
bucket := jm.GetBucket(task)
if bucket != nil {
bucket.TaskBound(task)
}
}
// GetBucket returns the bucket the task has been placed in, or nil if it is out of any bucket
func (jm *JobManager) GetBucket(task *api.TaskInfo) *Bucket {
index, ok := jm.podInBucket[task.Pod.UID]
if !ok || index == OutOfBucket {
return nil
}
bucket := jm.buckets[index]
return bucket
}
func (jm *JobManager) String() string {
// saa: selfAntiAffinity
// iaa: interAntiAffinity
// sa: selfAffinity
// ia: interAffinity
msg := []string{
fmt.Sprintf("%s - job %s max %d || saa: %v - iaa: %v - sa: %v - ia: %v || priority: %v - order: %v || ",
PluginName, jm.jobID, jm.bucketMaxSize,
jm.selfAntiAffinity, jm.interAntiAffinity,
jm.selfAffinity, jm.interAffinity,
jm.taskAffinityPriority, jm.taskExistOrder,
),
}
for _, bucket := range jm.buckets {
bucketMsg := fmt.Sprintf("b:%d -- ", bucket.index)
var info []string
for _, task := range bucket.tasks {
info = append(info, task.Pod.Name)
}
bucketMsg += strings.Join(info, ", ")
bucketMsg += "|"
info = nil
for nodeName, count := range bucket.node {
info = append(info, fmt.Sprintf("n%s-%d", nodeName, count))
}
bucketMsg += strings.Join(info, ", ")
msg = append(msg, "["+bucketMsg+"]")
}
return strings.Join(msg, " ")
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tasktopology
import (
"fmt"
"strings"
"time"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/framework"
)
type taskTopologyPlugin struct {
arguments framework.Arguments
weight int
managers map[api.JobID]*JobManager
}
// New returns a task-topology plugin object
func New(arguments framework.Arguments) framework.Plugin {
return &taskTopologyPlugin{
arguments: arguments,
weight: calculateWeight(arguments),
managers: make(map[api.JobID]*JobManager),
}
}
func (p *taskTopologyPlugin) Name() string {
return PluginName
}
// TaskOrderFn returns -1 to place l before r.
//
// for example:
// A:
// | bucket1 | bucket2 | out of bucket
// | a1 a3 | a2 | a4
// B:
// | bucket1 | out of bucket
// | b1 b2 | b3
// the right task order should be:
// a1 a3 a2 b1 b2 a4 b3
func (p *taskTopologyPlugin) TaskOrderFn(l interface{}, r interface{}) int {
lv, ok := l.(*api.TaskInfo)
if !ok {
klog.Errorf("Object is not a taskinfo")
}
rv, ok := r.(*api.TaskInfo)
if !ok {
klog.Errorf("Object is not a taskinfo")
}
lvJobManager := p.managers[lv.Job]
rvJobManager := p.managers[rv.Job]
var lvBucket, rvBucket *Bucket
if lvJobManager != nil {
lvBucket = lvJobManager.GetBucket(lv)
} else {
klog.V(4).Infof("No job manager for job <ID: %s>, do not return task order.", lv.Job)
return 0
}
if rvJobManager != nil {
rvBucket = rvJobManager.GetBucket(rv)
} else {
klog.V(4).Infof("No job manager for job <ID: %s>, do not return task order.", rv.Job)
return 0
}
// a task that has a bucket always comes before one that does not
lvInBucket := lvBucket != nil
rvInBucket := rvBucket != nil
if lvInBucket != rvInBucket {
if lvInBucket {
return -1
}
return 1
}
// comparing tasks across different jobs is not the duty of this plugin
if lv.Job != rv.Job {
return 0
}
// tasks out of any bucket have no defined order
if !lvInBucket && !rvInBucket {
return 0
}
// the bigger bucket takes priority over the smaller one
lvHasTask := len(lvBucket.tasks)
rvHasTask := len(rvBucket.tasks)
if lvHasTask != rvHasTask {
if lvHasTask > rvHasTask {
return -1
}
return 1
}
lvBucketIndex := lvBucket.index
rvBucketIndex := rvBucket.index
// within the same bucket, fall back to the task affinity order
if lvBucketIndex == rvBucketIndex {
affinityOrder := lvJobManager.taskAffinityOrder(lv, rv)
return -affinityOrder
}
// the older bucket takes priority over the younger one
if lvBucketIndex < rvBucketIndex {
return -1
}
return 1
}
func (p *taskTopologyPlugin) calcBucketScore(task *api.TaskInfo, node *api.NodeInfo) (int, *JobManager, error) {
// the task could never fit the node
maxResource := node.Idle.Clone().Add(node.Releasing)
if req := task.Resreq; req != nil && maxResource.LessPartly(req, api.Zero) {
return 0, nil, nil
}
jobManager, hasManager := p.managers[task.Job]
if !hasManager {
return 0, nil, nil
}
bucket := jobManager.GetBucket(task)
// task out of bucket
if bucket == nil {
return 0, jobManager, nil
}
// 1. the number of the bucket's tasks already bound to this node is the base score
score := bucket.node[node.Name]
// 2. the task's inter/self anti-affinity should be taken into account
if nodeTaskSet := jobManager.nodeTaskSet[node.Name]; nodeTaskSet != nil {
taskName := getTaskName(task)
affinityScore := jobManager.checkTaskSetAffinity(taskName, nodeTaskSet, true)
if affinityScore < 0 {
score += affinityScore
}
}
klog.V(4).Infof("task %s/%s, node %s, additional score %d, task %d",
task.Namespace, task.Name, node.Name, score, len(bucket.tasks))
// 3. take the other tasks in the bucket into consideration
score += len(bucket.tasks)
if bucket.request == nil || bucket.request.LessEqual(maxResource, api.Zero) {
return score, jobManager, nil
}
remains := bucket.request.Clone()
// randomly (by map iteration order) drop tasks until the bucket fits the node
for bucketTaskID, bucketTask := range bucket.tasks {
// the current task should be kept in the bucket
if bucketTaskID == task.Pod.UID || bucketTask.Resreq == nil {
continue
}
remains.Sub(bucketTask.Resreq)
score--
if remains.LessEqual(maxResource, api.Zero) {
break
}
}
// at this point the bucket's remaining request always fits maxResource
return score, jobManager, nil
}
func (p *taskTopologyPlugin) NodeOrderFn(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
score, jobManager, err := p.calcBucketScore(task, node)
if err != nil {
return 0, err
}
fScore := float64(score * p.weight)
if jobManager != nil && jobManager.bucketMaxSize != 0 {
fScore = fScore * float64(v1alpha1.MaxNodeScore) / float64(jobManager.bucketMaxSize)
}
klog.V(4).Infof("task %s/%s at node %s has bucket score %d, score %f",
task.Namespace, task.Name, node.Name, score, fScore)
return fScore, nil
}
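// Illustrative sketch of the normalization above: assuming MaxNodeScore is 100, a bucket
// score of 3 with weight 1 on a job whose bucketMaxSize is 5 yields
// fScore = 3 * 1 * 100 / 5 = 60, so nodes are scored relative to the job's largest
// bucket rather than on an absolute scale.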
func (p *taskTopologyPlugin) AllocateFunc(event *framework.Event) {
task := event.Task
jobManager, hasManager := p.managers[task.Job]
if !hasManager {
return
}
jobManager.TaskBound(task)
}
func (p *taskTopologyPlugin) initBucket(ssn *framework.Session) {
for jobID, job := range ssn.Jobs {
if noPendingTasks(job) {
klog.V(4).Infof("No pending tasks in job <%s/%s> by plugin %s.",
job.Namespace, job.Name, PluginName)
continue
}
jobTopology, err := readTopologyFromPgAnnotations(job)
if err != nil {
klog.V(4).Infof("Failed to read task topology from job <%s/%s> annotations, error: %s.",
job.Namespace, job.Name, err.Error())
continue
}
if jobTopology == nil {
continue
}
manager := NewJobManager(jobID)
manager.ApplyTaskTopology(jobTopology)
manager.ConstructBucket(job.Tasks)
p.managers[job.UID] = manager
}
}
func affinityCheck(job *api.JobInfo, affinity [][]string) error {
if job == nil || affinity == nil {
return fmt.Errorf("empty input, job: %v, affinity: %v", job, affinity)
}
var taskNumber = len(job.Tasks)
var taskRef = make(map[string]bool, taskNumber)
for _, task := range job.Tasks {
tmpStrings := strings.Split(task.Name, "-")
if _, exist := taskRef[tmpStrings[len(tmpStrings)-2]]; !exist {
taskRef[tmpStrings[len(tmpStrings)-2]] = true
}
}
for _, aff := range affinity {
affTasks := make(map[string]bool, len(aff))
for _, task := range aff {
if len(task) == 0 {
continue
}
if _, exist := taskRef[task]; !exist {
return fmt.Errorf("task %s do not exist in job <%s/%s>", task, job.Namespace, job.Name)
}
if _, exist := affTasks[task]; exist {
return fmt.Errorf("task %s is duplicated in job <%s/%s>", task, job.Namespace, job.Name)
}
affTasks[task] = true
}
}
return nil
}
func splitAnnotations(job *api.JobInfo, annotation string) ([][]string, error) {
affinityStr := strings.Split(annotation, ";")
if len(affinityStr) == 0 {
return nil, nil
}
var affinity = make([][]string, len(affinityStr))
for i, str := range affinityStr {
affinity[i] = strings.Split(str, ",")
}
if err := affinityCheck(job, affinity); err != nil {
klog.V(4).Infof("Job <%s/%s> affinity key invalid: %s.",
job.Namespace, job.Name, err.Error())
return nil, err
}
return affinity, nil
}
func readTopologyFromPgAnnotations(job *api.JobInfo) (*TaskTopology, error) {
jobAffinityStr, affinityExist := job.PodGroup.Annotations[JobAffinityAnnotations]
jobAntiAffinityStr, antiAffinityExist := job.PodGroup.Annotations[JobAntiAffinityAnnotations]
taskOrderStr, taskOrderExist := job.PodGroup.Annotations[TaskOrderAnnotations]
if !(affinityExist || antiAffinityExist || taskOrderExist) {
return nil, nil
}
var jobTopology = TaskTopology{
Affinity: nil,
AntiAffinity: nil,
TaskOrder: nil,
}
if affinityExist {
affinities, err := splitAnnotations(job, jobAffinityStr)
if err != nil {
klog.V(4).Infof("Job <%s/%s> affinity key invalid: %s.",
job.Namespace, job.Name, err.Error())
return nil, err
}
jobTopology.Affinity = affinities
}
if antiAffinityExist {
affinities, err := splitAnnotations(job, jobAntiAffinityStr)
if err != nil {
klog.V(4).Infof("Job <%s/%s> anti affinity key invalid: %s.",
job.Namespace, job.Name, err.Error())
return nil, err
}
jobTopology.AntiAffinity = affinities
}
if taskOrderExist {
jobTopology.TaskOrder = strings.Split(taskOrderStr, ",")
if err := affinityCheck(job, [][]string{jobTopology.TaskOrder}); err != nil {
klog.V(4).Infof("Job <%s/%s> task order key invalid: %s.",
job.Namespace, job.Name, err.Error())
return nil, err
}
}
return &jobTopology, nil
}
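// Illustrative sketch, using hypothetical task names ps and worker: with podgroup
// annotations such as
//
//	volcano.sh/task-topology-affinity: "ps,worker"
//	volcano.sh/task-topology-anti-affinity: "ps"
//	volcano.sh/task-topology-task-order: "worker,ps"
//
// splitAnnotations turns "ps,worker" into [][]string{{"ps", "worker"}} (";" separates
// groups, "," separates task names within a group), and the resulting TaskTopology is
// later handed to JobManager.ApplyTaskTopology.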
func (p *taskTopologyPlugin) OnSessionOpen(ssn *framework.Session) {
start := time.Now()
klog.V(3).Infof("start to init task topology plugin, weight[%d], defined order %v", p.weight, affinityPriority)
p.initBucket(ssn)
ssn.AddTaskOrderFn(p.Name(), p.TaskOrderFn)
ssn.AddNodeOrderFn(p.Name(), p.NodeOrderFn)
ssn.AddEventHandler(&framework.EventHandler{
AllocateFunc: p.AllocateFunc,
})
klog.V(3).Infof("finished to init task topology plugin, using time %v", time.Since(start))
}
func (p *taskTopologyPlugin) OnSessionClose(ssn *framework.Session) {
p.managers = nil
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tasktopology
import (
"volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/framework"
)
const (
// PluginName indicates name of volcano scheduler plugin
PluginName = "task-topology"
// PluginWeight is task-topology plugin weight in nodeOrderFn
PluginWeight = "task-topology.weight"
// JobAffinityKey is the key to read in task-topology arguments from job annotations
JobAffinityKey = "volcano.sh/task-topology"
// OutOfBucket indicates task is outside of any bucket
OutOfBucket = -1
// JobAffinityAnnotations is the key to read in task-topology affinity arguments from podgroup annotations
JobAffinityAnnotations = "volcano.sh/task-topology-affinity"
// JobAntiAffinityAnnotations is the key to read in task-topology anti-affinity arguments from podgroup annotations
JobAntiAffinityAnnotations = "volcano.sh/task-topology-anti-affinity"
// TaskOrderAnnotations is the key to read in task-topology task order arguments from podgroup annotations
TaskOrderAnnotations = "volcano.sh/task-topology-task-order"
)
// TaskTopology is the struct used to hold the affinity info of a job, read from the job plugin or annotations
type TaskTopology struct {
Affinity [][]string `json:"affinity,omitempty"`
AntiAffinity [][]string `json:"antiAffinity,omitempty"`
TaskOrder []string `json:"taskOrder,omitempty"`
}
func calculateWeight(args framework.Arguments) int {
/*
Users should set the plugin weight via the task-topology.weight argument, for example:
actions: "enqueue, reclaim, allocate, backfill, preempt"
tiers:
- plugins:
- name: task-topology
arguments:
task-topology.weight: 10
*/
// The weight defaults to 1.
weight := 1
args.GetInt(&weight, PluginWeight)
return weight
}
func getTaskName(task *api.TaskInfo) string {
return task.Pod.Annotations[v1alpha1.TaskSpecKey]
}
func addAffinity(m map[string]map[string]struct{}, src, dst string) {
srcMap, ok := m[src]
if !ok {
srcMap = make(map[string]struct{})
m[src] = srcMap
}
srcMap[dst] = struct{}{}
}
func noPendingTasks(job *api.JobInfo) bool {
return len(job.TaskStatusIndex[api.Pending]) == 0
}
// TaskOrder is the struct used to sort tasks by affinity order
type TaskOrder struct {
tasks []*api.TaskInfo
manager *JobManager
}
func (p *TaskOrder) Len() int { return len(p.tasks) }
func (p *TaskOrder) Swap(l, r int) {
p.tasks[l], p.tasks[r] = p.tasks[r], p.tasks[l]
}
func (p *TaskOrder) Less(l, r int) bool {
L := p.tasks[l]
R := p.tasks[r]
LHasNode := L.NodeName != ""
RHasNode := R.NodeName != ""
if LHasNode || RHasNode {
// a bound task has higher priority
if LHasNode != RHasNode {
return !LHasNode
}
// all bound, any order is alright
return L.NodeName > R.NodeName
}
result := p.manager.taskAffinityOrder(L, R)
// they have the same taskAffinity order, any order is alright
if result == 0 {
return L.Name > R.Name
}
return result < 0
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tdm
import (
"fmt"
"strings"
"time"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/framework"
tutil "volcano.sh/volcano/pkg/scheduler/plugins/util"
"volcano.sh/volcano/pkg/scheduler/util"
)
const (
// PluginName indicates name of volcano scheduler plugin.
PluginName = "tdm"
// revocableZoneLayout is the time layout used to parse revocable zone time ranges
revocableZoneLayout = "15:04"
revocableZoneLabelPrefix = "tdm.revocable-zone."
evictPeriodLabel = "tdm.evict.period"
defaultPodEvictNum = 1
)
var lastEvictAt time.Time
/*
actions: "enqueue, reclaim, allocate, preempt"
tiers:
- plugins:
- name: tdm
arguments:
tdm.revocable-zone.rz1: 10:00-21:00
tdm.revocable-zone.rz2: 12:00-14:00
tdm.evict.period: 1m
*/
type tdmPlugin struct {
revocableZone map[string]string
// evictPeriod is the minimum interval between two rounds of eviction, default 1m
evictPeriod time.Duration
}
// New returns a tdm plugin object
func New(args framework.Arguments) framework.Plugin {
revocableZone := make(map[string]string)
evictPeriod := time.Minute
for k, v := range args {
if strings.Contains(k, revocableZoneLabelPrefix) {
revocableZone[strings.Replace(k, revocableZoneLabelPrefix, "", 1)] = v
}
}
if period, ok := args[evictPeriodLabel]; ok {
if d, err := time.ParseDuration(period); err == nil {
evictPeriod = d
}
}
return &tdmPlugin{revocableZone, evictPeriod}
}
func (tp *tdmPlugin) Name() string {
return PluginName
}
func parseRevocableZone(rzRaw string) (start, end time.Time, err error) {
rzValues := strings.Split(strings.TrimSpace(rzRaw), "-")
if len(rzValues) != 2 {
err = fmt.Errorf("revocable zone %v format error", rzRaw)
return
}
t1, err := time.Parse(revocableZoneLayout, rzValues[0])
if err != nil {
return
}
t2, err := time.Parse(revocableZoneLayout, rzValues[1])
if err != nil {
return
}
now := time.Now()
start = time.Date(now.Year(), now.Month(), now.Day(), t1.Hour(), t1.Minute(), 0, 0, now.Location())
if t1.After(t2) || t1.Equal(t2) {
end = time.Date(now.Year(), now.Month(), now.Day()+1, t2.Hour(), t2.Minute(), 0, 0, now.Location())
} else {
end = time.Date(now.Year(), now.Month(), now.Day(), t2.Hour(), t2.Minute(), 0, 0, now.Location())
}
return
}
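// Illustrative sketch: for "10:00-21:00" parseRevocableZone returns today's 10:00 and
// 21:00 in the local time zone; for a range that wraps midnight such as "22:00-06:00"
// (start not before end), the end is pushed to 06:00 of the next day, so a zone
// configured as tdm.revocable-zone.rz1: 22:00-06:00 stays active overnight.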
func (tp *tdmPlugin) availableRevocableZone(rz string) error {
// rzRaw format 00:00-23:59
rzRaw, ok := tp.revocableZone[rz]
if !ok {
return fmt.Errorf("revocable zone %v not support", rz)
}
now := time.Now()
start, end, err := parseRevocableZone(rzRaw)
if err != nil {
return err
}
if now.Unix() < start.Unix() || now.Unix() > end.Unix() {
return fmt.Errorf("current time beyond revocable zone %v:%v", rz, rzRaw)
}
return nil
}
func (tp *tdmPlugin) OnSessionOpen(ssn *framework.Session) {
klog.V(4).Infof("Enter tdm plugin ...")
if klog.V(4) {
defer func() {
klog.V(4).Infof("Leaving tdm plugin.")
}()
}
// the tdm plugin only handles revocable nodes
predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) error {
if node.RevocableZone == "" {
return nil
}
if err := tp.availableRevocableZone(node.RevocableZone); err != nil {
return fmt.Errorf("plugin %s predicates %w", tp.Name(), err)
}
klog.V(4).Infof("TDM node %v revocable zone %v:%v is active", node.Name, node.RevocableZone, tp.revocableZone[node.RevocableZone])
if len(task.RevocableZone) == 0 {
msg := fmt.Sprintf("task %s/%s is not allow to dispatch to revocable node %s", task.Namespace, task.Name, node.Name)
return fmt.Errorf("plugin %s predicates %s", tp.Name(), msg)
}
klog.V(4).Infof("TDM filter for Task %s/%s on node %s pass.", task.Namespace, task.Name, node.Name)
return nil
}
// the tdm plugin only handles revocable nodes
nodeOrderFn := func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
score := 0.0
if node.RevocableZone == "" {
return score, nil
}
if err := tp.availableRevocableZone(node.RevocableZone); err != nil {
klog.V(4).Infof("TDM not available %s", err)
return score, err
}
if len(task.RevocableZone) == 0 {
klog.V(4).Infof("TDM task %s/%s is not allow to dispatch to revocable node %s", task.Namespace, task.Name, node.Name)
return score, nil
}
score = float64(v1alpha1.MaxNodeScore)
klog.V(4).Infof("TDM score for Task %s/%s on node %s is: %v", task.Namespace, task.Name, node.Name, score)
return score, nil
}
preemptableFn := func(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) ([]*api.TaskInfo, int) {
// preemptable workloads, or workloads that may use a revocable zone, cannot preempt other tasks.
if preemptor.Preemptable || len(preemptor.RevocableZone) > 0 {
klog.V(4).Infof("TDM task %s/%s is preemptable, do nothing skip", preemptor.Namespace, preemptor.Name)
return nil, tutil.Reject
}
var victims []*api.TaskInfo
tasksMap := make(map[api.JobID][]*api.TaskInfo)
// find preemptable tasks running on non-revocable nodes
for _, task := range preemptees {
if !task.Preemptable || task.Status != api.Running {
continue
}
node, ok := ssn.Nodes[task.NodeName]
if !ok {
continue
}
if node.RevocableZone != "" {
continue
}
tasksMap[task.Job] = append(tasksMap[task.Job], task)
}
for jobID, preemptableTasks := range tasksMap {
if job, ok := ssn.Jobs[jobID]; ok {
victims = append(victims, tp.maxVictims(job, preemptableTasks)...)
}
}
klog.V(4).Infof("TDM victims are %+v", victims)
return victims, tutil.Permit
}
victimsFn := func() []*api.TaskInfo {
if lastEvictAt.Add(tp.evictPeriod).After(time.Now()) {
klog.V(4).Infof("TDM next evict time at %v", lastEvictAt)
return nil
}
klog.V(4).Infof("TDM start to find victims")
// find preemptable tasks on nodes whose revocable zone has expired
victims := make([]*api.TaskInfo, 0)
for rz := range tp.revocableZone {
if err := tp.availableRevocableZone(rz); err != nil {
klog.V(4).Infof("TDM revocable zone %v disactive, %v", rz, err)
// rz disactive, then evict preemptable tasks by job from the revocable node
for jobID, preemtableTasks := range tp.revocableNodePreemptableTask(rz, ssn) {
if job, ok := ssn.Jobs[jobID]; ok {
victims = append(victims, tp.maxVictims(job, preemtableTasks)...)
}
}
}
}
// need to consider concurrency?
lastEvictAt = time.Now()
klog.V(4).Infof("TDM got %v victims", len(victims))
return victims
}
jobOrderFn := func(l, r interface{}) int {
lv := l.(*api.JobInfo)
rv := r.(*api.JobInfo)
if lv.Preemptable == rv.Preemptable {
return 0
}
if !lv.Preemptable {
return -1
}
return 1
}
jobPipelinedFn := func(obj interface{}) int {
jobInfo := obj.(*api.JobInfo)
occupied := jobInfo.WaitingTaskNum() + jobInfo.ReadyTaskNum()
if occupied >= jobInfo.MinAvailable {
return tutil.Permit
}
return tutil.Reject
}
jobStarvingFn := func(obj interface{}) bool {
jobInfo := obj.(*api.JobInfo)
// allow a non-preemptable elastic job (e.g. a Deployment) to preempt tasks
if jobInfo.Preemptable {
return false
}
return len(jobInfo.TaskStatusIndex[api.Pending]) > 0
}
ssn.AddPredicateFn(tp.Name(), predicateFn)
ssn.AddNodeOrderFn(tp.Name(), nodeOrderFn)
ssn.AddPreemptableFn(tp.Name(), preemptableFn)
ssn.AddVictimTasksFns(tp.Name(), victimsFn)
ssn.AddJobOrderFn(tp.Name(), jobOrderFn)
ssn.AddJobPipelinedFn(tp.Name(), jobPipelinedFn)
ssn.AddJobStarvingFns(tp.Name(), jobStarvingFn)
}
func (tp *tdmPlugin) maxVictims(job *api.JobInfo, victims []*api.TaskInfo) []*api.TaskInfo {
maxPodEvictNum := tp.getMaxPodEvictNum(job)
targetNum := util.GetMinInt(maxPodEvictNum, len(victims))
klog.V(3).Infof("Job <%s/%s> max evict:%v, potential victims number:%v, max victims number:%v",
job.Namespace, job.Name, maxPodEvictNum, len(victims), targetNum)
return victims[:targetNum]
}
// getMaxPodEvictNum returns the maximum number of pods that can be evicted, derived from the job's budget configuration
func (tp *tdmPlugin) getMaxPodEvictNum(job *api.JobInfo) int {
jobRunningTaskNum := len(job.TaskStatusIndex[api.Running])
if job.Budget.MaxUnavilable != "" {
maxUnavilable := tp.parseIntStr(job.Budget.MaxUnavilable, len(job.Tasks))
finalTaskNum := len(job.TaskStatusIndex[api.Succeeded]) + len(job.TaskStatusIndex[api.Failed])
realUnavilable := len(job.Tasks) - finalTaskNum - jobRunningTaskNum
if realUnavilable >= maxUnavilable {
return 0
}
return maxUnavilable - realUnavilable
}
if job.Budget.MinAvailable != "" {
minAvailable := tp.parseIntStr(job.Budget.MinAvailable, len(job.Tasks))
if jobRunningTaskNum >= minAvailable {
return jobRunningTaskNum - minAvailable
}
}
return defaultPodEvictNum
}
func (tp *tdmPlugin) parseIntStr(input string, taskNum int) int {
resultValue := 0
tmp := intstr.Parse(input)
switch tmp.Type {
case intstr.Int:
resultValue = tmp.IntValue()
case intstr.String:
if v, err := intstr.GetValueFromIntOrPercent(&tmp, taskNum, true); err == nil {
resultValue = v
} else {
klog.Warningf("TDM get percent value err: %v", err)
}
}
return resultValue
}
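// Illustrative sketch: for a hypothetical job with 10 tasks, parseIntStr("3", 10)
// returns 3 and parseIntStr("30%", 10) also resolves to 3 (percentages are rounded up
// against the task count); getMaxPodEvictNum then compares that budget with the tasks
// already unavailable to decide how many more pods may still be evicted.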
func (tp *tdmPlugin) revocableNodePreemptableTask(rz string, ssn *framework.Session) map[api.JobID][]*api.TaskInfo {
tasksMap := make(map[api.JobID][]*api.TaskInfo)
for _, node := range ssn.RevocableNodes {
if node.RevocableZone != rz {
continue
}
for _, task := range node.Tasks {
if task.Preemptable {
if task.Status == api.Running {
tasksMap[task.Job] = append(tasksMap[task.Job], task)
}
}
}
}
return tasksMap
}
func (tp *tdmPlugin) OnSessionClose(ssn *framework.Session) {}
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
"path/filepath"
"sync"
"time"
"github.com/fsnotify/fsnotify"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/rest"
"k8s.io/klog"
"volcano.sh/volcano/pkg/filewatcher"
schedcache "volcano.sh/volcano/pkg/scheduler/cache"
"volcano.sh/volcano/pkg/scheduler/conf"
"volcano.sh/volcano/pkg/scheduler/framework"
"volcano.sh/volcano/pkg/scheduler/metrics"
)
// Scheduler watches for new unscheduled pods for volcano. It attempts to find
// nodes that they fit on and writes bindings back to the api server.
type Scheduler struct {
cache schedcache.Cache
schedulerConf string
fileWatcher filewatcher.FileWatcher
schedulePeriod time.Duration
once sync.Once
mutex sync.Mutex
actions []framework.Action
plugins []conf.Tier
configurations []conf.Configuration
}
// NewScheduler returns a scheduler
func NewScheduler(
config *rest.Config,
schedulerName string,
schedulerConf string,
period time.Duration,
defaultQueue string,
nodeSelectors []string,
) (*Scheduler, error) {
var watcher filewatcher.FileWatcher
if schedulerConf != "" {
var err error
path := filepath.Dir(schedulerConf)
watcher, err = filewatcher.NewFileWatcher(path)
if err != nil {
return nil, fmt.Errorf("failed creating filewatcher for %s: %v", schedulerConf, err)
}
}
scheduler := &Scheduler{
schedulerConf: schedulerConf,
fileWatcher: watcher,
cache: schedcache.New(config, schedulerName, defaultQueue, nodeSelectors),
schedulePeriod: period,
}
return scheduler, nil
}
// Run runs the Scheduler
func (pc *Scheduler) Run(stopCh <-chan struct{}) {
pc.loadSchedulerConf()
go pc.watchSchedulerConf(stopCh)
// Start cache for policy.
pc.cache.Run(stopCh)
pc.cache.WaitForCacheSync(stopCh)
klog.V(2).Infof("scheduler completes Initialization and start to run")
go wait.Until(pc.runOnce, pc.schedulePeriod, stopCh)
}
func (pc *Scheduler) runOnce() {
klog.V(4).Infof("Start scheduling ...")
scheduleStartTime := time.Now()
defer klog.V(4).Infof("End scheduling ...")
pc.mutex.Lock()
actions := pc.actions
plugins := pc.plugins
configurations := pc.configurations
pc.mutex.Unlock()
ssn := framework.OpenSession(pc.cache, plugins, configurations)
defer framework.CloseSession(ssn)
for _, action := range actions {
actionStartTime := time.Now()
action.Execute(ssn)
metrics.UpdateActionDuration(action.Name(), metrics.Duration(actionStartTime))
}
metrics.UpdateE2eDuration(metrics.Duration(scheduleStartTime))
}
func (pc *Scheduler) loadSchedulerConf() {
var err error
pc.once.Do(func() {
pc.actions, pc.plugins, pc.configurations, err = unmarshalSchedulerConf(defaultSchedulerConf)
if err != nil {
klog.Errorf("unmarshal scheduler config %s failed: %v", defaultSchedulerConf, err)
panic("invalid default configuration")
}
})
var config string
if len(pc.schedulerConf) != 0 {
if config, err = readSchedulerConf(pc.schedulerConf); err != nil {
klog.Errorf("Failed to read scheduler configuration '%s', using previous configuration: %v",
pc.schedulerConf, err)
return
}
}
actions, plugins, configurations, err := unmarshalSchedulerConf(config)
if err != nil {
klog.Errorf("scheduler config %s is invalid: %v", config, err)
return
}
pc.mutex.Lock()
// If it is valid, use the new configuration
pc.actions = actions
pc.plugins = plugins
pc.configurations = configurations
pc.mutex.Unlock()
}
func (pc *Scheduler) watchSchedulerConf(stopCh <-chan struct{}) {
if pc.fileWatcher == nil {
return
}
eventCh := pc.fileWatcher.Events()
errCh := pc.fileWatcher.Errors()
for {
select {
case event, ok := <-eventCh:
if !ok {
return
}
klog.V(4).Infof("watch %s event: %v", pc.schedulerConf, event)
if event.Op&fsnotify.Write == fsnotify.Write || event.Op&fsnotify.Create == fsnotify.Create {
pc.loadSchedulerConf()
}
case err, ok := <-errCh:
if !ok {
return
}
klog.Infof("watch %s error: %v", pc.schedulerConf, err)
case <-stopCh:
return
}
}
}
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
"io/ioutil"
"strings"
"gopkg.in/yaml.v2"
"volcano.sh/volcano/pkg/scheduler/conf"
"volcano.sh/volcano/pkg/scheduler/framework"
"volcano.sh/volcano/pkg/scheduler/plugins"
)
var defaultSchedulerConf = `
actions: "enqueue, allocate, backfill"
tiers:
- plugins:
- name: priority
- name: gang
- name: conformance
- plugins:
- name: overcommit
- name: drf
- name: predicates
- name: proportion
- name: nodeorder
`
func unmarshalSchedulerConf(confStr string) ([]framework.Action, []conf.Tier, []conf.Configuration, error) {
var actions []framework.Action
schedulerConf := &conf.SchedulerConfiguration{}
if err := yaml.Unmarshal([]byte(confStr), schedulerConf); err != nil {
return nil, nil, nil, err
}
// Set default settings for each plugin if not set
for i, tier := range schedulerConf.Tiers {
// drf with hierarchy enabled
hdrf := false
// proportion enabled
proportion := false
for j := range tier.Plugins {
if tier.Plugins[j].Name == "drf" &&
tier.Plugins[j].EnabledHierarchy != nil &&
*tier.Plugins[j].EnabledHierarchy {
hdrf = true
}
if tier.Plugins[j].Name == "proportion" {
proportion = true
}
plugins.ApplyPluginConfDefaults(&schedulerConf.Tiers[i].Plugins[j])
}
if hdrf && proportion {
return nil, nil, nil, fmt.Errorf("proportion and drf with hierarchy enabled conflicts")
}
}
actionNames := strings.Split(schedulerConf.Actions, ",")
for _, actionName := range actionNames {
if action, found := framework.GetAction(strings.TrimSpace(actionName)); found {
actions = append(actions, action)
} else {
return nil, nil, nil, fmt.Errorf("failed to find Action %s, ignore it", actionName)
}
}
return actions, schedulerConf.Tiers, schedulerConf.Configurations, nil
}
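// Illustrative sketch, assuming the enqueue, allocate and backfill actions are
// registered with the framework: feeding defaultSchedulerConf through
// unmarshalSchedulerConf yields those three actions in order plus the two plugin tiers;
// an unknown action name, or enabling drf hierarchy together with proportion, returns
// an error and the caller keeps its previous configuration:
//
//	actions, tiers, cfgs, err := unmarshalSchedulerConf(defaultSchedulerConf)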
func readSchedulerConf(confPath string) (string, error) {
dat, err := ioutil.ReadFile(confPath)
if err != nil {
return "", err
}
return string(dat), nil
}
package util
import (
"context"
"fmt"
"sync"
"sync/atomic"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
"volcano.sh/volcano/pkg/scheduler/api"
)
type PredicateHelper interface {
PredicateNodes(task *api.TaskInfo, nodes []*api.NodeInfo, fn api.PredicateFn) ([]*api.NodeInfo, *api.FitErrors)
}
type predicateHelper struct {
taskPredicateErrorCache map[string]map[string]error
}
// PredicateNodes returns the nodes (up to a calculated limit) that fit the task, together with per-node fit errors
func (ph *predicateHelper) PredicateNodes(task *api.TaskInfo, nodes []*api.NodeInfo, fn api.PredicateFn) ([]*api.NodeInfo, *api.FitErrors) {
var errorLock sync.RWMutex
fe := api.NewFitErrors()
allNodes := len(nodes)
if allNodes == 0 {
return make([]*api.NodeInfo, 0), fe
}
numNodesToFind := CalculateNumOfFeasibleNodesToFind(int32(allNodes))
// allocate enough space to avoid growing the slice
predicateNodes := make([]*api.NodeInfo, numNodesToFind)
numFoundNodes := int32(0)
processedNodes := int32(0)
taskGroupid := taskGroupID(task)
nodeErrorCache, taskFailedBefore := ph.taskPredicateErrorCache[taskGroupid]
if nodeErrorCache == nil {
nodeErrorCache = map[string]error{}
}
// create a context with cancellation
ctx, cancel := context.WithCancel(context.Background())
checkNode := func(index int) {
// Check the nodes starting from where it left off in the previous scheduling cycle,
// to make sure all nodes have the same chance of being examined across pods.
node := nodes[(lastProcessedNodeIndex+index)%allNodes]
atomic.AddInt32(&processedNodes, 1)
klog.V(4).Infof("Considering Task <%v/%v> on node <%v>: <%v> vs. <%v>",
task.Namespace, task.Name, node.Name, task.Resreq, node.Idle)
// Check if the task had a predicate failure before, and whether it already failed on this node.
if taskFailedBefore {
errorLock.RLock()
errC, ok := nodeErrorCache[node.Name]
errorLock.RUnlock()
if ok {
errorLock.Lock()
fe.SetNodeError(node.Name, errC)
errorLock.Unlock()
return
}
}
// TODO (k82cn): Enable eCache for performance improvement.
if err := fn(task, node); err != nil {
klog.V(3).Infof("Predicates failed for task <%s/%s> on node <%s>: %v",
task.Namespace, task.Name, node.Name, err)
errorLock.Lock()
nodeErrorCache[node.Name] = err
ph.taskPredicateErrorCache[taskGroupid] = nodeErrorCache
fe.SetNodeError(node.Name, err)
errorLock.Unlock()
return
}
// Check if the number of found nodes exceeds numNodesToFind.
length := atomic.AddInt32(&numFoundNodes, 1)
if length > numNodesToFind {
cancel()
atomic.AddInt32(&numFoundNodes, -1)
} else {
predicateNodes[length-1] = node
}
}
//workqueue.ParallelizeUntil(context.TODO(), 16, len(nodes), checkNode)
workqueue.ParallelizeUntil(ctx, 16, allNodes, checkNode)
lastProcessedNodeIndex = (lastProcessedNodeIndex + int(processedNodes)) % allNodes
predicateNodes = predicateNodes[:numFoundNodes]
return predicateNodes, fe
}
func taskGroupID(task *api.TaskInfo) string {
return fmt.Sprintf("%s/%s", task.Job, task.GetTaskSpecKey())
}
func NewPredicateHelper() PredicateHelper {
return &predicateHelper{taskPredicateErrorCache: map[string]map[string]error{}}
}
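// Illustrative usage sketch (not part of the original source): a
// PredicateHelper is shared across a scheduling cycle and driven by a
// predicate function. The task, node slice, and predicate closure below are
// assumptions for illustration only.
//
//	ph := NewPredicateHelper()
//	feasible, fitErrors := ph.PredicateNodes(task, nodes, func(t *api.TaskInfo, n *api.NodeInfo) error {
//		if t.Resreq.LessEqual(n.Idle, api.Zero) {
//			return nil
//		}
//		return fmt.Errorf("node %s has insufficient idle resources", n.Name)
//	})
//	_ = feasible  // up to CalculateNumOfFeasibleNodesToFind(len(nodes)) nodes
//	_ = fitErrors // aggregated per-node failure reasons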
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"container/heap"
"volcano.sh/volcano/pkg/scheduler/api"
)
// PriorityQueue implements a scheduling queue.
type PriorityQueue struct {
queue priorityQueue
}
type priorityQueue struct {
items []interface{}
lessFn api.LessFn
}
// NewPriorityQueue returns a PriorityQueue
func NewPriorityQueue(lessFn api.LessFn) *PriorityQueue {
return &PriorityQueue{
queue: priorityQueue{
items: make([]interface{}, 0),
lessFn: lessFn,
},
}
}
// Push pushes an element into the priority queue.
func (q *PriorityQueue) Push(it interface{}) {
heap.Push(&q.queue, it)
}
// Pop pops the highest-priority element from the priority queue.
func (q *PriorityQueue) Pop() interface{} {
if q.Len() == 0 {
return nil
}
return heap.Pop(&q.queue)
}
// Empty checks whether the queue is empty.
func (q *PriorityQueue) Empty() bool {
return q.queue.Len() == 0
}
// Len returns the length of the priority queue.
func (q *PriorityQueue) Len() int {
return q.queue.Len()
}
func (pq *priorityQueue) Len() int { return len(pq.items) }
func (pq *priorityQueue) Less(i, j int) bool {
if pq.lessFn == nil {
return i < j
}
// We want Pop to give us the highest, not lowest, priority so we use greater than here.
return pq.lessFn(pq.items[i], pq.items[j])
}
func (pq priorityQueue) Swap(i, j int) {
pq.items[i], pq.items[j] = pq.items[j], pq.items[i]
}
func (pq *priorityQueue) Push(x interface{}) {
(*pq).items = append((*pq).items, x)
}
func (pq *priorityQueue) Pop() interface{} {
old := (*pq).items
n := len(old)
item := old[n-1]
(*pq).items = old[0 : n-1]
return item
}
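// Illustrative usage sketch (not part of the original source): the queue pops
// the "highest priority" element first according to the supplied LessFn. The
// job-comparison closure and job variables below are assumptions.
//
//	q := NewPriorityQueue(func(l, r interface{}) bool {
//		lv, rv := l.(*api.JobInfo), r.(*api.JobInfo)
//		return lv.Priority > rv.Priority
//	})
//	q.Push(jobA)
//	q.Push(jobB)
//	for !q.Empty() {
//		next := q.Pop() // highest-priority job first under the LessFn above
//		_ = next
//	}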
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"context"
"fmt"
"math"
"math/rand"
"sort"
"sync"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
"volcano.sh/volcano/cmd/scheduler/app/options"
"volcano.sh/volcano/pkg/scheduler/api"
)
const baselinePercentageOfNodesToFind = 50
var lastProcessedNodeIndex int
// Reservation is used to record target job and locked nodes
var Reservation *ResourceReservation
func init() {
Reservation = NewResourceReservation()
}
// CalculateNumOfFeasibleNodesToFind returns the number of feasible nodes that, once found,
// causes the scheduler to stop searching for more feasible nodes.
func CalculateNumOfFeasibleNodesToFind(numAllNodes int32) (numNodes int32) {
opts := options.ServerOpts
if numAllNodes <= opts.MinNodesToFind || opts.PercentageOfNodesToFind >= 100 {
return numAllNodes
}
adaptivePercentage := opts.PercentageOfNodesToFind
if adaptivePercentage <= 0 {
adaptivePercentage = baselinePercentageOfNodesToFind - numAllNodes/125
if adaptivePercentage < opts.MinPercentageOfNodesToFind {
adaptivePercentage = opts.MinPercentageOfNodesToFind
}
}
numNodes = numAllNodes * adaptivePercentage / 100
if numNodes < opts.MinNodesToFind {
numNodes = opts.MinNodesToFind
}
return numNodes
}
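// Worked example (illustrative, not part of the original source), assuming
// MinNodesToFind=100, MinPercentageOfNodesToFind=5 and an unset (zero)
// PercentageOfNodesToFind so the adaptive branch is taken:
//
//	numAllNodes        = 5000
//	adaptivePercentage = 50 - 5000/125 = 10
//	numNodes           = 5000 * 10 / 100 = 500
//
// so the search stops after 500 feasible nodes instead of scanning all 5000.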
// PrioritizeNodes returns a map whose keys are node scores and whose values are the nodes with that score
func PrioritizeNodes(task *api.TaskInfo, nodes []*api.NodeInfo, batchFn api.BatchNodeOrderFn, mapFn api.NodeOrderMapFn, reduceFn api.NodeOrderReduceFn) map[float64][]*api.NodeInfo {
pluginNodeScoreMap := map[string]k8sframework.NodeScoreList{}
nodeOrderScoreMap := map[string]float64{}
nodeScores := map[float64][]*api.NodeInfo{}
var workerLock sync.Mutex
scoreNode := func(index int) {
node := nodes[index]
mapScores, orderScore, err := mapFn(task, node)
if err != nil {
klog.Errorf("Error in Calculating Priority for the node:%v", err)
return
}
workerLock.Lock()
for plugin, score := range mapScores {
nodeScoreMap, ok := pluginNodeScoreMap[plugin]
if !ok {
nodeScoreMap = k8sframework.NodeScoreList{}
}
hp := k8sframework.NodeScore{}
hp.Name = node.Name
hp.Score = int64(math.Floor(score))
pluginNodeScoreMap[plugin] = append(nodeScoreMap, hp)
}
nodeOrderScoreMap[node.Name] = orderScore
workerLock.Unlock()
}
workqueue.ParallelizeUntil(context.TODO(), 16, len(nodes), scoreNode)
reduceScores, err := reduceFn(task, pluginNodeScoreMap)
if err != nil {
klog.Errorf("Error in Calculating Priority for the node:%v", err)
return nodeScores
}
batchNodeScore, err := batchFn(task, nodes)
if err != nil {
klog.Errorf("Error in Calculating batch Priority for the node, err %v", err)
return nodeScores
}
for _, node := range nodes {
if score, found := reduceScores[node.Name]; found {
if orderScore, ok := nodeOrderScoreMap[node.Name]; ok {
score += orderScore
}
if batchScore, ok := batchNodeScore[node.Name]; ok {
score += batchScore
}
nodeScores[score] = append(nodeScores[score], node)
} else {
// If no plugin is applied to this node, the default is 0.0
score = 0.0
if orderScore, ok := nodeOrderScoreMap[node.Name]; ok {
score += orderScore
}
if batchScore, ok := batchNodeScore[node.Name]; ok {
score += batchScore
}
nodeScores[score] = append(nodeScores[score], node)
}
}
return nodeScores
}
// SortNodes returns the nodes in descending order of score
func SortNodes(nodeScores map[float64][]*api.NodeInfo) []*api.NodeInfo {
var nodesInorder []*api.NodeInfo
var keys []float64
for key := range nodeScores {
keys = append(keys, key)
}
sort.Sort(sort.Reverse(sort.Float64Slice(keys)))
for _, key := range keys {
nodes := nodeScores[key]
nodesInorder = append(nodesInorder, nodes...)
}
return nodesInorder
}
// SelectBestNode returns the node with the highest score; if several nodes share that score, one of them is picked at random.
func SelectBestNode(nodeScores map[float64][]*api.NodeInfo) *api.NodeInfo {
var bestNodes []*api.NodeInfo
maxScore := -1.0
for score, nodes := range nodeScores {
if score > maxScore {
maxScore = score
bestNodes = nodes
}
}
if len(bestNodes) == 0 {
return nil
}
return bestNodes[rand.Intn(len(bestNodes))]
}
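// Illustrative flow sketch (not part of the original source): feasible nodes
// are scored, then the best-scoring node is chosen. The plugin callbacks are
// assumed to come from the session's registered node-order functions.
//
//	nodeScores := PrioritizeNodes(task, feasibleNodes, batchNodeOrderFn, nodeOrderMapFn, nodeOrderReduceFn)
//	best := SelectBestNode(nodeScores)
//	if best == nil {
//		// no node was scored; skip the task in this cycle
//	}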
// GetNodeList returns the NodeInfo values in 'nodes' for the node names listed in 'nodeList'
func GetNodeList(nodes map[string]*api.NodeInfo, nodeList []string) []*api.NodeInfo {
result := make([]*api.NodeInfo, 0, len(nodeList))
for _, nodename := range nodeList {
if ni, ok := nodes[nodename]; ok {
result = append(result, ni)
}
}
return result
}
// ValidateVictims returns an error if the resources of the victims can't satisfy the preemptor
func ValidateVictims(preemptor *api.TaskInfo, node *api.NodeInfo, victims []*api.TaskInfo) error {
if len(victims) == 0 {
return fmt.Errorf("no victims")
}
futureIdle := node.FutureIdle()
for _, victim := range victims {
futureIdle.Add(victim.Resreq)
}
// Every resource requested by the preemptor needs to be less than or equal to the
// corresponding idle resource after preemption.
if !preemptor.InitResreq.LessEqual(futureIdle, api.Zero) {
return fmt.Errorf("not enough resources: requested <%v>, but future idle <%v>",
preemptor.InitResreq, futureIdle)
}
return nil
}
// ResourceReservation is struct used for resource reservation
type ResourceReservation struct {
TargetJob *api.JobInfo
LockedNodes map[string]*api.NodeInfo
}
// NewResourceReservation is used to create global instance
func NewResourceReservation() *ResourceReservation {
return &ResourceReservation{
TargetJob: nil,
LockedNodes: map[string]*api.NodeInfo{},
}
}
// GetMinInt returns the minimum of vals
func GetMinInt(vals ...int) int {
if len(vals) == 0 {
return 0
}
min := vals[0]
for _, val := range vals {
if val <= min {
min = val
}
}
return min
}
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"fmt"
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
schedulingv2 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/scheduler/api"
)
// BuildResourceList builds a resource list object
func BuildResourceList(cpu string, memory string) v1.ResourceList {
return v1.ResourceList{
v1.ResourceCPU: resource.MustParse(cpu),
v1.ResourceMemory: resource.MustParse(memory),
api.GPUResourceName: resource.MustParse("0"),
}
}
// BuildResourceListWithGPU builds a resource list with GPU
func BuildResourceListWithGPU(cpu string, memory string, GPU string) v1.ResourceList {
return v1.ResourceList{
v1.ResourceCPU: resource.MustParse(cpu),
v1.ResourceMemory: resource.MustParse(memory),
api.GPUResourceName: resource.MustParse(GPU),
}
}
// BuildNode builds a node object
func BuildNode(name string, alloc v1.ResourceList, labels map[string]string) *v1.Node {
return &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: labels,
Annotations: map[string]string{},
},
Status: v1.NodeStatus{
Capacity: alloc,
Allocatable: alloc,
},
}
}
// BuildPod builds a Pod object
func BuildPod(namespace, name, nodename string, p v1.PodPhase, req v1.ResourceList, groupName string, labels map[string]string, selector map[string]string) *v1.Pod {
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: types.UID(fmt.Sprintf("%v-%v", namespace, name)),
Name: name,
Namespace: namespace,
Labels: labels,
Annotations: map[string]string{
schedulingv2.KubeGroupNameAnnotationKey: groupName,
},
},
Status: v1.PodStatus{
Phase: p,
},
Spec: v1.PodSpec{
NodeName: nodename,
NodeSelector: selector,
Containers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Requests: req,
},
},
},
},
}
}
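// Illustrative test-helper usage (not part of the original source): the node
// name, pod name, resource quantities and group name below are arbitrary
// example values.
//
//	node := BuildNode("n1", BuildResourceList("4", "8Gi"), map[string]string{"zone": "z1"})
//	pod := BuildPod("default", "p1", "n1", v1.PodRunning,
//		BuildResourceList("1", "1Gi"), "pg1", map[string]string{"app": "demo"}, nil)
//	_, _ = node, pod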
// FakeBinder is used as a fake binder
type FakeBinder struct {
Binds map[string]string
Channel chan string
}
// Bind is used by the fake binder to record pod bindings
func (fb *FakeBinder) Bind(kubeClient *kubernetes.Clientset, tasks []*api.TaskInfo) (error, []*api.TaskInfo) {
for _, p := range tasks {
key := fmt.Sprintf("%v/%v", p.Namespace, p.Name)
fb.Binds[key] = p.NodeName
}
return nil, nil
}
// FakeEvictor is used as a fake evictor
type FakeEvictor struct {
sync.Mutex
evicts []string
Channel chan string
}
// Evicts returns a copy of the evicted pod keys.
func (fe *FakeEvictor) Evicts() []string {
fe.Lock()
defer fe.Unlock()
return append([]string{}, fe.evicts...)
}
// Evict is used by the fake evictor to record pod evictions
func (fe *FakeEvictor) Evict(p *v1.Pod, reason string) error {
fe.Lock()
defer fe.Unlock()
fmt.Println("PodName: ", p.Name)
key := fmt.Sprintf("%v/%v", p.Namespace, p.Name)
fe.evicts = append(fe.evicts, key)
fe.Channel <- key
return nil
}
// FakeStatusUpdater is used for faking status updates
type FakeStatusUpdater struct {
}
// UpdatePodCondition is a no-op function
func (ftsu *FakeStatusUpdater) UpdatePodCondition(pod *v1.Pod, podCondition *v1.PodCondition) (*v1.Pod, error) {
// do nothing here
return nil, nil
}
// UpdatePodGroup is a no-op function
func (ftsu *FakeStatusUpdater) UpdatePodGroup(pg *api.PodGroup) (*api.PodGroup, error) {
// do nothing here
return nil, nil
}
// FakeVolumeBinder is used as a fake volume binder
type FakeVolumeBinder struct {
}
// AllocateVolumes is a no-op function
func (fvb *FakeVolumeBinder) AllocateVolumes(task *api.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
return nil
}
// BindVolumes is a no-op function
func (fvb *FakeVolumeBinder) BindVolumes(task *api.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
return nil
}
// GetPodVolumes is a no-op function
func (fvb *FakeVolumeBinder) GetPodVolumes(task *api.TaskInfo, node *v1.Node) (*volumescheduling.PodVolumes, error) {
return nil, nil
}
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mutate
import (
"encoding/json"
"fmt"
"strconv"
"k8s.io/api/admission/v1beta1"
whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/batch/v1alpha1"
"volcano.sh/volcano/pkg/webhooks/router"
"volcano.sh/volcano/pkg/webhooks/schema"
"volcano.sh/volcano/pkg/webhooks/util"
)
const (
// DefaultQueue constant stores the name of the queue as "default"
DefaultQueue = "default"
// DefaultMaxRetry is the default number of retries.
DefaultMaxRetry = 3
defaultSchedulerName = "volcano"
defaultMaxRetry int32 = 3
)
func init() {
router.RegisterAdmission(service)
}
var service = &router.AdmissionService{
Path: "/jobs/mutate",
Func: Jobs,
MutatingConfig: &whv1beta1.MutatingWebhookConfiguration{
Webhooks: []whv1beta1.MutatingWebhook{{
Name: "mutatejob.volcano.sh",
Rules: []whv1beta1.RuleWithOperations{
{
Operations: []whv1beta1.OperationType{whv1beta1.Create},
Rule: whv1beta1.Rule{
APIGroups: []string{"batch.volcano.sh"},
APIVersions: []string{"v1alpha1"},
Resources: []string{"jobs"},
},
},
},
}},
},
}
type patchOperation struct {
Op string `json:"op"`
Path string `json:"path"`
Value interface{} `json:"value,omitempty"`
}
// Jobs mutates jobs.
func Jobs(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
klog.V(3).Infof("mutating jobs")
job, err := schema.DecodeJob(ar.Request.Object, ar.Request.Resource)
if err != nil {
return util.ToAdmissionResponse(err)
}
var patchBytes []byte
switch ar.Request.Operation {
case v1beta1.Create:
patchBytes, _ = createPatch(job)
default:
err = fmt.Errorf("expect operation to be 'CREATE' ")
return util.ToAdmissionResponse(err)
}
klog.V(3).Infof("AdmissionResponse: patch=%v", string(patchBytes))
reviewResponse := v1beta1.AdmissionResponse{
Allowed: true,
Patch: patchBytes,
}
pt := v1beta1.PatchTypeJSONPatch
reviewResponse.PatchType = &pt
return &reviewResponse
}
func createPatch(job *v1alpha1.Job) ([]byte, error) {
var patch []patchOperation
pathQueue := patchDefaultQueue(job)
if pathQueue != nil {
patch = append(patch, *pathQueue)
}
pathScheduler := patchDefaultScheduler(job)
if pathScheduler != nil {
patch = append(patch, *pathScheduler)
}
pathMaxRetry := patchDefaultMaxRetry(job)
if pathMaxRetry != nil {
patch = append(patch, *pathMaxRetry)
}
pathSpec := mutateSpec(job.Spec.Tasks, "/spec/tasks")
if pathSpec != nil {
patch = append(patch, *pathSpec)
}
pathMinAvailable := patchDefaultMinAvailable(job)
if pathMinAvailable != nil {
patch = append(patch, *pathMinAvailable)
}
// Add default plugins for some distributed-framework plugin cases
patchPlugins := patchDefaultPlugins(job)
if patchPlugins != nil {
patch = append(patch, *patchPlugins)
}
return json.Marshal(patch)
}
func patchDefaultQueue(job *v1alpha1.Job) *patchOperation {
//Add default queue if not specified.
if job.Spec.Queue == "" {
return &patchOperation{Op: "add", Path: "/spec/queue", Value: DefaultQueue}
}
return nil
}
func patchDefaultScheduler(job *v1alpha1.Job) *patchOperation {
// Add default scheduler name if not specified.
if job.Spec.SchedulerName == "" {
return &patchOperation{Op: "add", Path: "/spec/schedulerName", Value: defaultSchedulerName}
}
return nil
}
func patchDefaultMaxRetry(job *v1alpha1.Job) *patchOperation {
// Add default maxRetry if maxRetry is zero.
if job.Spec.MaxRetry == 0 {
return &patchOperation{Op: "add", Path: "/spec/maxRetry", Value: DefaultMaxRetry}
}
return nil
}
func patchDefaultMinAvailable(job *v1alpha1.Job) *patchOperation {
// Add default minAvailable if minAvailable is zero.
if job.Spec.MinAvailable == 0 {
var jobMinAvailable int32
for _, task := range job.Spec.Tasks {
if task.MinAvailable != nil {
jobMinAvailable += *task.MinAvailable
} else {
jobMinAvailable += task.Replicas
}
}
return &patchOperation{Op: "add", Path: "/spec/minAvailable", Value: jobMinAvailable}
}
return nil
}
func mutateSpec(tasks []v1alpha1.TaskSpec, basePath string) *patchOperation {
patched := false
for index := range tasks {
// add default task name
taskName := tasks[index].Name
if len(taskName) == 0 {
patched = true
tasks[index].Name = v1alpha1.DefaultTaskSpec + strconv.Itoa(index)
}
if tasks[index].Template.Spec.HostNetwork && tasks[index].Template.Spec.DNSPolicy == "" {
patched = true
tasks[index].Template.Spec.DNSPolicy = v1.DNSClusterFirstWithHostNet
}
if tasks[index].MinAvailable == nil {
patched = true
minAvailable := tasks[index].Replicas
tasks[index].MinAvailable = &minAvailable
}
if tasks[index].MaxRetry == 0 {
patched = true
tasks[index].MaxRetry = defaultMaxRetry
}
}
if !patched {
return nil
}
return &patchOperation{
Op: "replace",
Path: basePath,
Value: tasks,
}
}
func patchDefaultPlugins(job *v1alpha1.Job) *patchOperation {
if job.Spec.Plugins == nil {
return nil
}
plugins := map[string][]string{}
for k, v := range job.Spec.Plugins {
plugins[k] = v
}
// The tensorflow plugin depends on the svc plugin.
// If the svc plugin is not defined, add it.
if _, ok := job.Spec.Plugins["tensorflow"]; ok {
if _, ok := plugins["svc"]; !ok {
plugins["svc"] = []string{}
}
}
return &patchOperation{
Op: "replace",
Path: "/spec/plugins",
Value: plugins,
}
}
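// Illustrative example (not part of the original source): for a Job submitted
// with a single one-replica task and no queue, schedulerName, maxRetry or
// minAvailable, createPatch produces a JSON patch along these lines:
//
//	[
//	  {"op": "add", "path": "/spec/queue", "value": "default"},
//	  {"op": "add", "path": "/spec/schedulerName", "value": "volcano"},
//	  {"op": "add", "path": "/spec/maxRetry", "value": 3},
//	  {"op": "replace", "path": "/spec/tasks", "value": [/* tasks with defaulted name, minAvailable and maxRetry */]},
//	  {"op": "add", "path": "/spec/minAvailable", "value": 1}
//	]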
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package validate
import (
"context"
"fmt"
"strings"
"k8s.io/api/admission/v1beta1"
whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
v1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/validation"
"k8s.io/apimachinery/pkg/util/validation/field"
"k8s.io/klog"
k8score "k8s.io/kubernetes/pkg/apis/core"
k8scorev1 "k8s.io/kubernetes/pkg/apis/core/v1"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
k8scorevalid "k8s.io/kubernetes/pkg/apis/core/validation"
"volcano.sh/apis/pkg/apis/batch/v1alpha1"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
"volcano.sh/volcano/pkg/controllers/job/plugins"
"volcano.sh/volcano/pkg/webhooks/router"
"volcano.sh/volcano/pkg/webhooks/schema"
"volcano.sh/volcano/pkg/webhooks/util"
)
func init() {
router.RegisterAdmission(service)
}
var service = &router.AdmissionService{
Path: "/jobs/validate",
Func: AdmitJobs,
Config: config,
ValidatingConfig: &whv1beta1.ValidatingWebhookConfiguration{
Webhooks: []whv1beta1.ValidatingWebhook{{
Name: "validatejob.volcano.sh",
Rules: []whv1beta1.RuleWithOperations{
{
Operations: []whv1beta1.OperationType{whv1beta1.Create, whv1beta1.Update},
Rule: whv1beta1.Rule{
APIGroups: []string{"batch.volcano.sh"},
APIVersions: []string{"v1alpha1"},
Resources: []string{"jobs"},
},
},
},
}},
},
}
var config = &router.AdmissionServiceConfig{}
// AdmitJobs admits jobs and returns the admission response.
func AdmitJobs(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
klog.V(3).Infof("admitting jobs -- %s", ar.Request.Operation)
job, err := schema.DecodeJob(ar.Request.Object, ar.Request.Resource)
if err != nil {
return util.ToAdmissionResponse(err)
}
var msg string
reviewResponse := v1beta1.AdmissionResponse{}
reviewResponse.Allowed = true
switch ar.Request.Operation {
case v1beta1.Create:
msg = validateJobCreate(job, &reviewResponse)
case v1beta1.Update:
oldJob, err := schema.DecodeJob(ar.Request.OldObject, ar.Request.Resource)
if err != nil {
return util.ToAdmissionResponse(err)
}
err = validateJobUpdate(oldJob, job)
if err != nil {
return util.ToAdmissionResponse(err)
}
default:
err := fmt.Errorf("expect operation to be 'CREATE' or 'UPDATE'")
return util.ToAdmissionResponse(err)
}
if !reviewResponse.Allowed {
reviewResponse.Result = &metav1.Status{Message: strings.TrimSpace(msg)}
}
return &reviewResponse
}
func validateJobCreate(job *v1alpha1.Job, reviewResponse *v1beta1.AdmissionResponse) string {
var msg string
taskNames := map[string]string{}
var totalReplicas int32
if job.Spec.MinAvailable < 0 {
reviewResponse.Allowed = false
return "job 'minAvailable' must be >= 0."
}
if job.Spec.MaxRetry < 0 {
reviewResponse.Allowed = false
return "'maxRetry' cannot be less than zero."
}
if job.Spec.TTLSecondsAfterFinished != nil && *job.Spec.TTLSecondsAfterFinished < 0 {
reviewResponse.Allowed = false
return "'ttlSecondsAfterFinished' cannot be less than zero."
}
if len(job.Spec.Tasks) == 0 {
reviewResponse.Allowed = false
return "No task specified in job spec"
}
hasDependenciesBetweenTasks := false
for index, task := range job.Spec.Tasks {
if task.DependsOn != nil {
hasDependenciesBetweenTasks = true
}
if task.Replicas < 0 {
msg += fmt.Sprintf(" 'replicas' < 0 in task: %s;", task.Name)
}
if task.MinAvailable != nil && *task.MinAvailable > task.Replicas {
msg += fmt.Sprintf(" 'minAvailable' is greater than 'replicas' in task: %s, job: %s", task.Name, job.Name)
}
// count replicas
totalReplicas += task.Replicas
// validate task name
if errMsgs := validation.IsDNS1123Label(task.Name); len(errMsgs) > 0 {
msg += fmt.Sprintf(" %v;", errMsgs)
}
// duplicate task name
if _, found := taskNames[task.Name]; found {
msg += fmt.Sprintf(" duplicated task name %s;", task.Name)
break
} else {
taskNames[task.Name] = task.Name
}
if err := validatePolicies(task.Policies, field.NewPath("spec.tasks.policies")); err != nil {
msg += err.Error() + fmt.Sprintf(" valid events are %v, valid actions are %v",
getValidEvents(), getValidActions())
}
podName := jobhelpers.MakePodName(job.Name, task.Name, index)
msg += validateK8sPodNameLength(podName)
msg += validateTaskTemplate(task, job, index)
}
msg += validateJobName(job)
if totalReplicas < job.Spec.MinAvailable {
msg += "job 'minAvailable' should not be greater than total replicas in tasks;"
}
if err := validatePolicies(job.Spec.Policies, field.NewPath("spec.policies")); err != nil {
msg = msg + err.Error() + fmt.Sprintf(" valid events are %v, valid actions are %v;",
getValidEvents(), getValidActions())
}
// invalid job plugins
if len(job.Spec.Plugins) != 0 {
for name := range job.Spec.Plugins {
if _, found := plugins.GetPluginBuilder(name); !found {
msg += fmt.Sprintf(" unable to find job plugin: %s", name)
}
}
}
if err := validateIO(job.Spec.Volumes); err != nil {
msg += err.Error()
}
queue, err := config.VolcanoClient.SchedulingV1beta1().Queues().Get(context.TODO(), job.Spec.Queue, metav1.GetOptions{})
if err != nil {
msg += fmt.Sprintf(" unable to find job queue: %v", err)
} else if queue.Status.State != schedulingv1beta1.QueueStateOpen {
msg += fmt.Sprintf("can only submit job to queue with state `Open`, "+
"queue `%s` status is `%s`", queue.Name, queue.Status.State)
}
if hasDependenciesBetweenTasks {
_, isDag := topoSort(job)
if !isDag {
msg += fmt.Sprintf("job has dependencies between tasks, but doesn't form a directed acyclic graph(DAG)")
}
}
if msg != "" {
reviewResponse.Allowed = false
}
return msg
}
func validateJobUpdate(old, new *v1alpha1.Job) error {
var totalReplicas int32
for _, task := range new.Spec.Tasks {
if task.Replicas < 0 {
return fmt.Errorf("'replicas' must be >= 0 in task: %s", task.Name)
}
if task.MinAvailable != nil && *task.MinAvailable > task.Replicas {
return fmt.Errorf("'minAvailable' must be <= 'replicas' in task: %s;", task.Name)
}
// count replicas
totalReplicas += task.Replicas
}
if new.Spec.MinAvailable > totalReplicas {
return fmt.Errorf("job 'minAvailable' must not be greater than total replicas")
}
if new.Spec.MinAvailable < 0 {
return fmt.Errorf("job 'minAvailable' must be >= 0")
}
if len(old.Spec.Tasks) != len(new.Spec.Tasks) {
return fmt.Errorf("job updates may not add or remove tasks")
}
// other fields under spec are not allowed to mutate
new.Spec.MinAvailable = old.Spec.MinAvailable
new.Spec.PriorityClassName = old.Spec.PriorityClassName
for i := range new.Spec.Tasks {
new.Spec.Tasks[i].Replicas = old.Spec.Tasks[i].Replicas
new.Spec.Tasks[i].MinAvailable = old.Spec.Tasks[i].MinAvailable
}
// job controller will update the pvc name if not provided
for i := range new.Spec.Volumes {
if new.Spec.Volumes[i].VolumeClaim != nil {
new.Spec.Volumes[i].VolumeClaimName = ""
}
}
for i := range old.Spec.Volumes {
if old.Spec.Volumes[i].VolumeClaim != nil {
old.Spec.Volumes[i].VolumeClaimName = ""
}
}
if !apiequality.Semantic.DeepEqual(new.Spec, old.Spec) {
return fmt.Errorf("job updates may not change fields other than `minAvailable`, `tasks[*].replicas under spec`")
}
return nil
}
func validateTaskTemplate(task v1alpha1.TaskSpec, job *v1alpha1.Job, index int) string {
var v1PodTemplate v1.PodTemplate
v1PodTemplate.Template = *task.Template.DeepCopy()
k8scorev1.SetObjectDefaults_PodTemplate(&v1PodTemplate)
var coreTemplateSpec k8score.PodTemplateSpec
k8scorev1.Convert_v1_PodTemplateSpec_To_core_PodTemplateSpec(&v1PodTemplate.Template, &coreTemplateSpec, nil)
// Skip verifying container SecurityContext.Privileged as it depends on
// the kube-apiserver `allow-privileged` flag.
for i, container := range coreTemplateSpec.Spec.Containers {
if container.SecurityContext != nil && container.SecurityContext.Privileged != nil {
coreTemplateSpec.Spec.Containers[i].SecurityContext.Privileged = nil
}
}
corePodTemplate := k8score.PodTemplate{
ObjectMeta: metav1.ObjectMeta{
Name: task.Name,
Namespace: job.Namespace,
},
Template: coreTemplateSpec,
}
if allErrs := k8scorevalid.ValidatePodTemplate(&corePodTemplate); len(allErrs) > 0 {
msg := fmt.Sprintf("spec.task[%d].", index)
for index := range allErrs {
msg += allErrs[index].Error() + ". "
}
return msg
}
return validateTaskTopoPolicy(task, index)
}
func validateK8sPodNameLength(podName string) string {
if errMsgs := validation.IsQualifiedName(podName); len(errMsgs) > 0 {
return fmt.Sprintf("create pod with name %s validate failed %v;", podName, errMsgs)
}
return ""
}
func validateJobName(job *v1alpha1.Job) string {
if errMsgs := validation.IsQualifiedName(job.Name); len(errMsgs) > 0 {
return fmt.Sprintf("create job with name %s validate failed %v", job.Name, errMsgs)
}
return ""
}
func validateTaskTopoPolicy(task v1alpha1.TaskSpec, index int) string {
if task.TopologyPolicy == "" || task.TopologyPolicy == v1alpha1.None {
return ""
}
template := task.Template.DeepCopy()
for id, container := range template.Spec.Containers {
if len(container.Resources.Requests) == 0 {
template.Spec.Containers[id].Resources.Requests = container.Resources.Limits.DeepCopy()
}
}
for id, container := range template.Spec.InitContainers {
if len(container.Resources.Requests) == 0 {
template.Spec.InitContainers[id].Resources.Requests = container.Resources.Limits.DeepCopy()
}
}
pod := &v1.Pod{
Spec: template.Spec,
}
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
return fmt.Sprintf("spec.task[%d] isn't Guaranteed pod, kind=%v", index, v1qos.GetPodQOS(pod))
}
for id, container := range append(template.Spec.Containers, template.Spec.InitContainers...) {
requestNum := guaranteedCPUs(container)
if requestNum == 0 {
return fmt.Sprintf("the cpu request isn't an integer in spec.task[%d] container[%d].",
index, id)
}
}
return ""
}
func guaranteedCPUs(container v1.Container) int {
cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
return 0
}
return int(cpuQuantity.Value())
}
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package validate
import (
"fmt"
"github.com/hashicorp/go-multierror"
"k8s.io/apimachinery/pkg/util/validation/field"
"k8s.io/kubernetes/pkg/apis/core/validation"
batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
)
// policyEventMap defines all policy events and whether to allow external use.
var policyEventMap = map[busv1alpha1.Event]bool{
busv1alpha1.AnyEvent: true,
busv1alpha1.PodFailedEvent: true,
busv1alpha1.PodEvictedEvent: true,
busv1alpha1.JobUnknownEvent: true,
busv1alpha1.TaskCompletedEvent: true,
busv1alpha1.TaskFailedEvent: true,
busv1alpha1.OutOfSyncEvent: false,
busv1alpha1.CommandIssuedEvent: false,
busv1alpha1.JobUpdatedEvent: true,
}
// policyActionMap defines all policy actions and whether to allow external use.
var policyActionMap = map[busv1alpha1.Action]bool{
busv1alpha1.AbortJobAction: true,
busv1alpha1.RestartJobAction: true,
busv1alpha1.RestartTaskAction: true,
busv1alpha1.TerminateJobAction: true,
busv1alpha1.CompleteJobAction: true,
busv1alpha1.ResumeJobAction: true,
busv1alpha1.SyncJobAction: false,
busv1alpha1.EnqueueAction: false,
busv1alpha1.SyncQueueAction: false,
busv1alpha1.OpenQueueAction: false,
busv1alpha1.CloseQueueAction: false,
}
func validatePolicies(policies []batchv1alpha1.LifecyclePolicy, fldPath *field.Path) error {
var err error
policyEvents := map[busv1alpha1.Event]struct{}{}
exitCodes := map[int32]struct{}{}
for _, policy := range policies {
if (policy.Event != "" || len(policy.Events) != 0) && policy.ExitCode != nil {
err = multierror.Append(err, fmt.Errorf("must not specify event and exitCode simultaneously"))
break
}
if policy.Event == "" && len(policy.Events) == 0 && policy.ExitCode == nil {
err = multierror.Append(err, fmt.Errorf("either event and exitCode should be specified"))
break
}
if len(policy.Event) != 0 || len(policy.Events) != 0 {
bFlag := false
policyEventsList := getEventList(policy)
for _, event := range policyEventsList {
if allow, ok := policyEventMap[event]; !ok || !allow {
err = multierror.Append(err, field.Invalid(fldPath, event, "invalid policy event"))
bFlag = true
break
}
if allow, ok := policyActionMap[policy.Action]; !ok || !allow {
err = multierror.Append(err, field.Invalid(fldPath, policy.Action, "invalid policy action"))
bFlag = true
break
}
if _, found := policyEvents[event]; found {
err = multierror.Append(err, fmt.Errorf("duplicate event %v across different policy", event))
bFlag = true
break
} else {
policyEvents[event] = struct{}{}
}
}
if bFlag {
break
}
} else {
if *policy.ExitCode == 0 {
err = multierror.Append(err, fmt.Errorf("0 is not a valid error code"))
break
}
if _, found := exitCodes[*policy.ExitCode]; found {
err = multierror.Append(err, fmt.Errorf("duplicate exitCode %v", *policy.ExitCode))
break
} else {
exitCodes[*policy.ExitCode] = struct{}{}
}
}
}
if _, found := policyEvents[busv1alpha1.AnyEvent]; found && len(policyEvents) > 1 {
err = multierror.Append(err, fmt.Errorf("if there's * here, no other policy should be here"))
}
return err
}
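// Illustrative example (not part of the original source): a policy list that
// passes the checks above, expressed in job YAML. The event and action names
// are the string values of the allowed constants in the maps above (assumed
// spellings).
//
//	policies:
//	- event: PodFailed
//	  action: RestartJob
//	- exitCode: 137
//	  action: TerminateJob
//
// Specifying both an event and an exitCode in one policy, reusing an event
// across policies, or combining AnyEvent ('*') with other events is rejected.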
func getEventList(policy batchv1alpha1.LifecyclePolicy) []busv1alpha1.Event {
policyEventsList := policy.Events
if len(policy.Event) > 0 {
policyEventsList = append(policyEventsList, policy.Event)
}
uniquePolicyEventlist := removeDuplicates(policyEventsList)
return uniquePolicyEventlist
}
func removeDuplicates(eventList []busv1alpha1.Event) []busv1alpha1.Event {
keys := make(map[busv1alpha1.Event]bool)
list := []busv1alpha1.Event{}
for _, val := range eventList {
if _, value := keys[val]; !value {
keys[val] = true
list = append(list, val)
}
}
return list
}
func getValidEvents() []busv1alpha1.Event {
var events []busv1alpha1.Event
for e, allow := range policyEventMap {
if allow {
events = append(events, e)
}
}
return events
}
func getValidActions() []busv1alpha1.Action {
var actions []busv1alpha1.Action
for a, allow := range policyActionMap {
if allow {
actions = append(actions, a)
}
}
return actions
}
// validateIO validates IO configuration.
func validateIO(volumes []batchv1alpha1.VolumeSpec) error {
volumeMap := map[string]bool{}
for _, volume := range volumes {
if len(volume.MountPath) == 0 {
return fmt.Errorf(" mountPath is required;")
}
if _, found := volumeMap[volume.MountPath]; found {
return fmt.Errorf(" duplicated mountPath: %s;", volume.MountPath)
}
if volume.VolumeClaim == nil && volume.VolumeClaimName == "" {
return fmt.Errorf(" either VolumeClaim or VolumeClaimName must be specified;")
}
if len(volume.VolumeClaimName) != 0 {
if volume.VolumeClaim != nil {
return fmt.Errorf("conflict: If you want to use an existing PVC, just specify VolumeClaimName." +
"If you want to create a new PVC, you do not need to specify VolumeClaimName")
}
if errMsgs := validation.ValidatePersistentVolumeName(volume.VolumeClaimName, false); len(errMsgs) > 0 {
return fmt.Errorf("invalid VolumeClaimName %s : %v", volume.VolumeClaimName, errMsgs)
}
}
volumeMap[volume.MountPath] = true
}
return nil
}
// topoSort uses topological sorting to order job tasks based on the dependsOn field.
// It returns an array containing all sorted task names and a bool indicating whether the tasks form a valid DAG.
func topoSort(job *batchv1alpha1.Job) ([]string, bool) {
graph, inDegree, taskList := makeGraph(job)
var taskStack []string
for task, degree := range inDegree {
if degree == 0 {
taskStack = append(taskStack, task)
}
}
sortedTasks := make([]string, 0)
for len(taskStack) > 0 {
length := len(taskStack)
var out string
out, taskStack = taskStack[length-1], taskStack[:length-1]
sortedTasks = append(sortedTasks, out)
for in, connected := range graph[out] {
if connected {
graph[out][in] = false
inDegree[in]--
if inDegree[in] == 0 {
taskStack = append(taskStack, in)
}
}
}
}
isDag := len(sortedTasks) == len(taskList)
if !isDag {
return nil, false
}
return sortedTasks, isDag
}
func makeGraph(job *batchv1alpha1.Job) (map[string]map[string]bool, map[string]int, []string) {
graph := make(map[string]map[string]bool)
inDegree := make(map[string]int)
taskList := make([]string, 0)
for _, task := range job.Spec.Tasks {
taskList = append(taskList, task.Name)
inDegree[task.Name] = 0
if task.DependsOn != nil {
for _, dependOnTask := range task.DependsOn.Name {
if graph[dependOnTask] == nil {
graph[dependOnTask] = make(map[string]bool)
}
graph[dependOnTask][task.Name] = true
inDegree[task.Name]++
}
}
}
return graph, inDegree, taskList
}
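// Illustrative example (not part of the original source): with tasks "master",
// "worker" and "chief", where worker depends on master and chief depends on
// worker, makeGraph yields in-degrees {master: 0, worker: 1, chief: 1} and
// topoSort returns a valid order such as [master, worker, chief] with
// isDag == true. If master additionally depended on chief, no task would start
// with in-degree zero and topoSort would report isDag == false.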
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mutate
import (
"github.com/imdario/mergo"
"gopkg.in/yaml.v2"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
wkconfig "volcano.sh/volcano/pkg/webhooks/config"
)
type annotationResGroup struct{}
const (
// defaultAnnotationKey: default annotation key
defaultAnnotationKey = "volcano.sh/resource-group"
)
// NewAnnotationResGroup creates a new annotation-based resource group
func NewAnnotationResGroup() ResGroup {
return &annotationResGroup{}
}
// getAnnotation gets annotations from the resource group config
func getAnnotation(resGroupConfig wkconfig.ResGroupConfig) map[string]string {
annotations := make(map[string]string)
for _, val := range resGroupConfig.Object.Value {
tmp := make(map[string]string)
err := yaml.Unmarshal([]byte(val), &tmp)
if err != nil {
continue
}
if err := mergo.Merge(&annotations, &tmp); err != nil {
klog.Errorf("annotations merge failed, err=%v", err)
continue
}
}
return annotations
}
// IsBelongResGroup judges whether the pod belongs to the resource group
func (resGroup *annotationResGroup) IsBelongResGroup(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) bool {
if resGroupConfig.Object.Key != "" && resGroupConfig.Object.Key != "annotation" {
return false
}
annotations := getAnnotation(resGroupConfig)
klog.V(3).Infof("annotations : %v", annotations)
for key, annotation := range annotations {
if pod.Annotations[key] == annotation {
return true
}
}
if resGroupConfig.Object.Key == "" && pod.Annotations[defaultAnnotationKey] == resGroupConfig.ResourceGroup {
return true
}
return false
}
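// Illustrative configuration sketch (not part of the original source): a
// resource group whose membership is decided by pod annotations. The group
// name, annotation value and YAML field names are assumptions about the
// webhook configuration layout.
//
//	resourceGroups:
//	- resourceGroup: gpu-jobs
//	  object:
//	    key: annotation
//	    value:
//	    - "volcano.sh/resource-group: gpu-jobs"
//
// A pod annotated with volcano.sh/resource-group=gpu-jobs is then treated by
// IsBelongResGroup as belonging to this group.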
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mutate
import (
v1 "k8s.io/api/core/v1"
wkconfig "volcano.sh/volcano/pkg/webhooks/config"
)
// ResGroup interface for resource group
type ResGroup interface {
IsBelongResGroup(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) bool
}
// GetResGroup returns the interface based on resourceGroup.Object.Key
func GetResGroup(resourceGroup wkconfig.ResGroupConfig) ResGroup {
switch resourceGroup.Object.Key {
case "namespace":
return NewNamespaceResGroup()
case "annotation":
return NewAnnotationResGroup()
}
return NewAnnotationResGroup()
}
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mutate
import (
"encoding/json"
"fmt"
"k8s.io/api/admission/v1beta1"
whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
v1 "k8s.io/api/core/v1"
"k8s.io/klog"
wkconfig "volcano.sh/volcano/pkg/webhooks/config"
"volcano.sh/volcano/pkg/webhooks/router"
"volcano.sh/volcano/pkg/webhooks/schema"
"volcano.sh/volcano/pkg/webhooks/util"
)
// patchOperation defines the patch operation structure
type patchOperation struct {
Op string `json:"op"`
Path string `json:"path"`
Value interface{} `json:"value,omitempty"`
}
// init registers the pod mutating webhook
func init() {
router.RegisterAdmission(service)
}
var service = &router.AdmissionService{
Path: "/pods/mutate",
Func: Pods,
Config: config,
MutatingConfig: &whv1beta1.MutatingWebhookConfiguration{
Webhooks: []whv1beta1.MutatingWebhook{{
Name: "mutatepod.volcano.sh",
Rules: []whv1beta1.RuleWithOperations{
{
Operations: []whv1beta1.OperationType{whv1beta1.Create},
Rule: whv1beta1.Rule{
APIGroups: []string{""},
APIVersions: []string{"v1"},
Resources: []string{"pods"},
},
},
},
}},
},
}
var config = &router.AdmissionServiceConfig{}
// Pods mutates pods.
func Pods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
klog.V(3).Infof("mutating pods -- %s", ar.Request.Operation)
pod, err := schema.DecodePod(ar.Request.Object, ar.Request.Resource)
if err != nil {
return util.ToAdmissionResponse(err)
}
if pod.Namespace == "" {
pod.Namespace = ar.Request.Namespace
}
var patchBytes []byte
switch ar.Request.Operation {
case v1beta1.Create:
patchBytes, _ = createPatch(pod)
default:
err = fmt.Errorf("expect operation to be 'CREATE' ")
return util.ToAdmissionResponse(err)
}
reviewResponse := v1beta1.AdmissionResponse{
Allowed: true,
Patch: patchBytes,
}
pt := v1beta1.PatchTypeJSONPatch
reviewResponse.PatchType = &pt
return &reviewResponse
}
// createPatch builds the JSON patch for the pod
func createPatch(pod *v1.Pod) ([]byte, error) {
if config.ConfigData == nil {
klog.V(5).Infof("admission configuration is empty.")
return nil, nil
}
var patch []patchOperation
config.ConfigData.Lock()
defer config.ConfigData.Unlock()
for _, resourceGroup := range config.ConfigData.ResGroupsConfig {
klog.V(3).Infof("resourceGroup %s", resourceGroup.ResourceGroup)
group := GetResGroup(resourceGroup)
if !group.IsBelongResGroup(pod, resourceGroup) {
continue
}
patchLabel := patchLabels(pod, resourceGroup)
if patchLabel != nil {
patch = append(patch, *patchLabel)
}
patchToleration := patchTaintToleration(pod, resourceGroup)
if patchToleration != nil {
patch = append(patch, *patchToleration)
}
patchScheduler := patchSchedulerName(resourceGroup)
if patchScheduler != nil {
patch = append(patch, *patchScheduler)
}
klog.V(5).Infof("pod patch %v", patch)
return json.Marshal(patch)
}
return json.Marshal(patch)
}
// patchLabels patches the pod's nodeSelector with the resource group labels
func patchLabels(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) *patchOperation {
if len(resGroupConfig.Labels) == 0 {
return nil
}
nodeSelector := make(map[string]string)
for key, label := range pod.Spec.NodeSelector {
nodeSelector[key] = label
}
for key, label := range resGroupConfig.Labels {
nodeSelector[key] = label
}
return &patchOperation{Op: "add", Path: "/spec/nodeSelector", Value: nodeSelector}
}
// patchTaintToleration patches the pod's tolerations with the resource group tolerations
func patchTaintToleration(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) *patchOperation {
if len(resGroupConfig.Tolerations) == 0 {
return nil
}
var dst []v1.Toleration
dst = append(dst, pod.Spec.Tolerations...)
dst = append(dst, resGroupConfig.Tolerations...)
return &patchOperation{Op: "add", Path: "/spec/tolerations", Value: dst}
}
// patchSchedulerName patches the pod's schedulerName
func patchSchedulerName(resGroupConfig wkconfig.ResGroupConfig) *patchOperation {
if resGroupConfig.SchedulerName == "" {
return nil
}
return &patchOperation{Op: "add", Path: "/spec/schedulerName", Value: resGroupConfig.SchedulerName}
}
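// Illustrative example (not part of the original source): for a pod matched by
// a resource group that configures labels, tolerations and a schedulerName,
// createPatch produces a JSON patch along these lines (all values are example
// assumptions, not taken from the original source):
//
//	[
//	  {"op": "add", "path": "/spec/nodeSelector", "value": {"nodegroup": "group1"}},
//	  {"op": "add", "path": "/spec/tolerations", "value": [{"key": "group", "operator": "Equal", "value": "group1", "effect": "NoSchedule"}]},
//	  {"op": "add", "path": "/spec/schedulerName", "value": "volcano"}
//	]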
/*
Copyright 2021 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mutate
import (
v1 "k8s.io/api/core/v1"
wkconfig "volcano.sh/volcano/pkg/webhooks/config"
)
type namespaceResGroup struct{}
// NewNamespaceResGroup creates a new namespace-based resource group
func NewNamespaceResGroup() ResGroup {
return &namespaceResGroup{}
}
// IsBelongResGroup judges whether the pod belongs to the resource group
func (resGroup *namespaceResGroup) IsBelongResGroup(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) bool {
if resGroupConfig.Object.Key != "namespace" {
return false
}
for _, val := range resGroupConfig.Object.Value {
if pod.Namespace == val {
return true
}
}
return false
}
/*
Copyright 2019 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package validate
import (
"context"
"fmt"
"strconv"
"strings"
"k8s.io/api/admission/v1beta1"
whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/klog"
"volcano.sh/apis/pkg/apis/helpers"
vcv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/webhooks/router"
"volcano.sh/volcano/pkg/webhooks/schema"
"volcano.sh/volcano/pkg/webhooks/util"
)
func init() {
router.RegisterAdmission(service)
}
var service = &router.AdmissionService{
Path: "/pods/validate",
Func: AdmitPods,
Config: config,
ValidatingConfig: &whv1beta1.ValidatingWebhookConfiguration{
Webhooks: []whv1beta1.ValidatingWebhook{{
Name: "validatepod.volcano.sh",
Rules: []whv1beta1.RuleWithOperations{
{
Operations: []whv1beta1.OperationType{whv1beta1.Create},
Rule: whv1beta1.Rule{
APIGroups: []string{""},
APIVersions: []string{"v1"},
Resources: []string{"pods"},
},
},
},
}},
},
}
var config = &router.AdmissionServiceConfig{}
// AdmitPods admits pods and returns the admission response.
func AdmitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
klog.V(3).Infof("admitting pods -- %s", ar.Request.Operation)
pod, err := schema.DecodePod(ar.Request.Object, ar.Request.Resource)
if err != nil {
return util.ToAdmissionResponse(err)
}
var msg string
reviewResponse := v1beta1.AdmissionResponse{}
reviewResponse.Allowed = true
switch ar.Request.Operation {
case v1beta1.Create:
msg = validatePod(pod, &reviewResponse)
default:
err := fmt.Errorf("expect operation to be 'CREATE'")
return util.ToAdmissionResponse(err)
}
if !reviewResponse.Allowed {
reviewResponse.Result = &metav1.Status{Message: strings.TrimSpace(msg)}
}
return &reviewResponse
}
/*
Allow pod creation when:
1. the pod's schedulerName isn't volcano;
2. the pod has a PodGroup whose phase isn't Pending;
3. a normal pod whose schedulerName is volcano has no PodGroup;
4. the pod's budget annotations are configured correctly.
*/
func validatePod(pod *v1.Pod, reviewResponse *v1beta1.AdmissionResponse) string {
if pod.Spec.SchedulerName != config.SchedulerName {
return ""
}
pgName := ""
msg := ""
// vc-job: pod created by a Volcano Job (schedulerName == volcano)
if pod.Annotations != nil {
pgName = pod.Annotations[vcv1beta1.KubeGroupNameAnnotationKey]
}
if pgName != "" {
if err := checkPGPhase(pod, pgName, true); err != nil {
msg = err.Error()
reviewResponse.Allowed = false
}
return msg
}
// normal pod with schedulerName == volcano
pgName = helpers.GeneratePodgroupName(pod)
if err := checkPGPhase(pod, pgName, false); err != nil {
msg = err.Error()
reviewResponse.Allowed = false
}
// check pod annotations
if err := validateAnnotation(pod); err != nil {
msg = err.Error()
reviewResponse.Allowed = false
}
return msg
}
func checkPGPhase(pod *v1.Pod, pgName string, isVCJob bool) error {
pg, err := config.VolcanoClient.SchedulingV1beta1().PodGroups(pod.Namespace).Get(context.TODO(), pgName, metav1.GetOptions{})
if err != nil {
if isVCJob || (!isVCJob && !apierrors.IsNotFound(err)) {
return fmt.Errorf("failed to get PodGroup for pod <%s/%s>: %v", pod.Namespace, pod.Name, err)
}
return nil
}
if pg.Status.Phase != vcv1beta1.PodGroupPending {
return nil
}
return fmt.Errorf("failed to create pod <%s/%s> as the podgroup phase is Pending",
pod.Namespace, pod.Name)
}
func validateAnnotation(pod *v1.Pod) error {
num := 0
if len(pod.Annotations) > 0 {
keys := []string{
vcv1beta1.JDBMinAvailable,
vcv1beta1.JDBMaxUnavailable,
}
for _, key := range keys {
if value, found := pod.Annotations[key]; found {
num++
if err := validateIntPercentageStr(key, value); err != nil {
recordEvent(err)
return err
}
}
}
if num > 1 {
return fmt.Errorf("not allow configure multiple annotations <%v> at same time", keys)
}
}
return nil
}
func recordEvent(err error) {
config.Recorder.Eventf(nil, v1.EventTypeWarning, "Admit", "Create pod failed due to %v", err)
}
func validateIntPercentageStr(key, value string) error {
tmp := intstr.Parse(value)
switch tmp.Type {
case intstr.Int:
if tmp.IntValue() <= 0 {
return fmt.Errorf("invalid value <%q> for %v, it must be a positive integer", value, key)
}
return nil
case intstr.String:
s := strings.Replace(tmp.StrVal, "%", "", -1)
v, err := strconv.Atoi(s)
if err != nil {
return fmt.Errorf("invalid value %v for %v", err, key)
}
if v <= 0 || v >= 100 {
return fmt.Errorf("invalid value <%q> for %v, it must be a valid percentage which between 1%% ~ 99%%", tmp.StrVal, key)
}
return nil
}
return fmt.Errorf("invalid type: neither int nor percentage for %v", key)
}
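// Illustrative examples (not part of the original source) of the annotation
// checks above, written against the JDB annotation keys imported from
// vcv1beta1:
//
//	pod.Annotations[vcv1beta1.JDBMinAvailable] = "2"     // accepted: positive integer
//	pod.Annotations[vcv1beta1.JDBMaxUnavailable] = "30%" // accepted: percentage within 1%-99%
//	pod.Annotations[vcv1beta1.JDBMinAvailable] = "0"     // rejected: must be positive
//
// Setting both keys on the same pod is also rejected by validateAnnotation.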
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mutate
import (
"encoding/json"
"fmt"
"strings"
"k8s.io/api/admission/v1beta1"
whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/webhooks/router"
"volcano.sh/volcano/pkg/webhooks/schema"
"volcano.sh/volcano/pkg/webhooks/util"
)
func init() {
router.RegisterAdmission(service)
}
var service = &router.AdmissionService{
Path: "/queues/mutate",
Func: Queues,
MutatingConfig: &whv1beta1.MutatingWebhookConfiguration{
Webhooks: []whv1beta1.MutatingWebhook{{
Name: "mutatequeue.volcano.sh",
Rules: []whv1beta1.RuleWithOperations{
{
Operations: []whv1beta1.OperationType{whv1beta1.Create},
Rule: whv1beta1.Rule{
APIGroups: []string{schedulingv1beta1.SchemeGroupVersion.Group},
APIVersions: []string{schedulingv1beta1.SchemeGroupVersion.Version},
Resources: []string{"queues"},
},
},
},
}},
},
}
type patchOperation struct {
Op string `json:"op"`
Path string `json:"path"`
Value interface{} `json:"value,omitempty"`
}
// Queues mutates queues.
func Queues(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
klog.V(3).Infof("Mutating %s queue %s.", ar.Request.Operation, ar.Request.Name)
queue, err := schema.DecodeQueue(ar.Request.Object, ar.Request.Resource)
if err != nil {
return util.ToAdmissionResponse(err)
}
var patchBytes []byte
switch ar.Request.Operation {
case v1beta1.Create:
patchBytes, err = createQueuePatch(queue)
default:
return util.ToAdmissionResponse(fmt.Errorf("invalid operation `%s`, "+
"expect operation to be `CREATE`", ar.Request.Operation))
}
if err != nil {
return &v1beta1.AdmissionResponse{
Allowed: false,
Result: &metav1.Status{Message: err.Error()},
}
}
pt := v1beta1.PatchTypeJSONPatch
return &v1beta1.AdmissionResponse{
Allowed: true,
Patch: patchBytes,
PatchType: &pt,
}
}
func createQueuePatch(queue *schedulingv1beta1.Queue) ([]byte, error) {
var patch []patchOperation
// add the root node if it is not specified
hierarchy := queue.Annotations[schedulingv1beta1.KubeHierarchyAnnotationKey]
hierarchicalWeights := queue.Annotations[schedulingv1beta1.KubeHierarchyWeightAnnotationKey]
if hierarchy != "" && hierarchicalWeights != "" && !strings.HasPrefix(hierarchy, "root") {
// based on https://tools.ietf.org/html/rfc6901#section-3
// escape "/" with "~1"
patch = append(patch, patchOperation{
Op: "add",
Path: fmt.Sprintf("/metadata/annotations/%s", strings.ReplaceAll(schedulingv1beta1.KubeHierarchyAnnotationKey, "/", "~1")),
Value: fmt.Sprintf("root/%s", hierarchy),
})
patch = append(patch, patchOperation{
Op: "add",
Path: fmt.Sprintf("/metadata/annotations/%s", strings.ReplaceAll(schedulingv1beta1.KubeHierarchyWeightAnnotationKey, "/", "~1")),
Value: fmt.Sprintf("1/%s", hierarchicalWeights),
})
}
trueValue := true
if queue.Spec.Reclaimable == nil {
patch = append(patch, patchOperation{
Op: "add",
Path: "/spec/reclaimable",
Value: &trueValue,
})
}
defaultWeight := 1
if queue.Spec.Weight == 0 {
patch = append(patch, patchOperation{
Op: "add",
Path: "/spec/weight",
Value: &defaultWeight,
})
}
return json.Marshal(patch)
}
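// Illustrative example (not part of the original source): a queue created with
// hierarchy annotation "sci/dev" and weights annotation "2/3", and with no
// explicit weight or reclaimable field, is patched by createQueuePatch to
// hierarchy "root/sci/dev", weights "1/2/3", reclaimable=true and weight=1.
// The annotation keys are the KubeHierarchyAnnotationKey and
// KubeHierarchyWeightAnnotationKey constants referenced above.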
/*
Copyright 2018 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package validate
import (
"context"
"fmt"
"strconv"
"strings"
"k8s.io/api/admission/v1beta1"
whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/validation/field"
"k8s.io/klog"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/webhooks/router"
"volcano.sh/volcano/pkg/webhooks/schema"
"volcano.sh/volcano/pkg/webhooks/util"
)
func init() {
router.RegisterAdmission(service)
}
var service = &router.AdmissionService{
Path: "/queues/validate",
Func: AdmitQueues,
Config: config,
ValidatingConfig: &whv1beta1.ValidatingWebhookConfiguration{
Webhooks: []whv1beta1.ValidatingWebhook{{
Name: "validatequeue.volcano.sh",
Rules: []whv1beta1.RuleWithOperations{
{
Operations: []whv1beta1.OperationType{whv1beta1.Create, whv1beta1.Update, whv1beta1.Delete},
Rule: whv1beta1.Rule{
APIGroups: []string{schedulingv1beta1.SchemeGroupVersion.Group},
APIVersions: []string{schedulingv1beta1.SchemeGroupVersion.Version},
Resources: []string{"queues"},
},
},
},
}},
},
}
var config = &router.AdmissionServiceConfig{}
// AdmitQueues admits queues and returns the admission response.
func AdmitQueues(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
klog.V(3).Infof("Admitting %s queue %s.", ar.Request.Operation, ar.Request.Name)
queue, err := schema.DecodeQueue(ar.Request.Object, ar.Request.Resource)
if err != nil {
return util.ToAdmissionResponse(err)
}
switch ar.Request.Operation {
case v1beta1.Create, v1beta1.Update:
err = validateQueue(queue)
case v1beta1.Delete:
err = validateQueueDeleting(ar.Request.Name)
default:
return util.ToAdmissionResponse(fmt.Errorf("invalid operation `%s`, "+
"expect operation to be `CREATE`, `UPDATE` or `DELETE`", ar.Request.Operation))
}
if err != nil {
return &v1beta1.AdmissionResponse{
Allowed: false,
Result: &metav1.Status{Message: err.Error()},
}
}
return &v1beta1.AdmissionResponse{
Allowed: true,
}
}
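// validateQueue aggregates field-level validation of a queue's state, weight
// and hierarchical annotations into a single error.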
func validateQueue(queue *schedulingv1beta1.Queue) error {
errs := field.ErrorList{}
resourcePath := field.NewPath("requestBody")
errs = append(errs, validateStateOfQueue(queue.Status.State, resourcePath.Child("spec").Child("state"))...)
errs = append(errs, validateWeightOfQueue(queue.Spec.Weight, resourcePath.Child("spec").Child("weight"))...)
errs = append(errs, validateHierarchicalAttributes(queue, resourcePath.Child("metadata").Child("annotations"))...)
if len(errs) > 0 {
return errs.ToAggregate()
}
return nil
}
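// validateHierarchicalAttributes checks the hierarchy annotations of a queue:
// the path and weight annotations must have the same number of "/"-separated
// segments, every weight must be a positive number, and the queue's path must
// not be a prefix of another existing queue's path.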
func validateHierarchicalAttributes(queue *schedulingv1beta1.Queue, fldPath *field.Path) field.ErrorList {
errs := field.ErrorList{}
hierarchy := queue.Annotations[schedulingv1beta1.KubeHierarchyAnnotationKey]
hierarchicalWeights := queue.Annotations[schedulingv1beta1.KubeHierarchyWeightAnnotationKey]
if hierarchy != "" || hierarchicalWeights != "" {
paths := strings.Split(hierarchy, "/")
weights := strings.Split(hierarchicalWeights, "/")
// the number of hierarchy path segments must match the number of weights
if len(paths) != len(weights) {
return append(errs, field.Invalid(fldPath, hierarchy,
fmt.Sprintf("%s must have the same length with %s",
schedulingv1beta1.KubeHierarchyAnnotationKey,
schedulingv1beta1.KubeHierarchyWeightAnnotationKey,
)))
}
// check weights format
for _, weight := range weights {
weightFloat, err := strconv.ParseFloat(weight, 64)
if err != nil {
return append(errs, field.Invalid(fldPath, hierarchicalWeights,
fmt.Sprintf("%s in the %s is invalid number: %v",
weight, hierarchicalWeights, err,
)))
}
if weightFloat <= 0 {
return append(errs, field.Invalid(fldPath, hierarchicalWeights,
fmt.Sprintf("%s in the %s must be larger than 0",
weight, hierarchicalWeights,
)))
}
}
// A queue must not be an ancestor of another queue in the hierarchy.
// For example, a queue with hierarchy "root/sci" conflicts with an existing queue with "root/sci/dev".
queueList, err := config.VolcanoClient.SchedulingV1beta1().Queues().List(context.TODO(), metav1.ListOptions{})
if err != nil {
return append(errs, field.Invalid(fldPath, hierarchy,
fmt.Sprintf("checking %s, list queues failed: %v",
schedulingv1beta1.KubeHierarchyAnnotationKey,
err,
)))
}
for _, queueInTree := range queueList.Items {
hierarchyInTree := queueInTree.Annotations[schedulingv1beta1.KubeHierarchyAnnotationKey]
if hierarchyInTree != "" && queue.Name != queueInTree.Name &&
strings.HasPrefix(hierarchyInTree, hierarchy) {
return append(errs, field.Invalid(fldPath, hierarchy,
fmt.Sprintf("%s is not allowed to be in the sub path of %s of queue %s",
hierarchy, hierarchyInTree, queueInTree.Name)))
}
}
}
return errs
}
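// validateStateOfQueue accepts an empty state or one of the known queue
// states (Open, Closed) and rejects anything else.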
func validateStateOfQueue(value schedulingv1beta1.QueueState, fldPath *field.Path) field.ErrorList {
errs := field.ErrorList{}
if len(value) == 0 {
return errs
}
validQueueStates := []schedulingv1beta1.QueueState{
schedulingv1beta1.QueueStateOpen,
schedulingv1beta1.QueueStateClosed,
}
for _, validQueue := range validQueueStates {
if value == validQueue {
return errs
}
}
return append(errs, field.Invalid(fldPath, value, fmt.Sprintf("queue state must be in %v", validQueueStates)))
}
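// validateWeightOfQueue rejects non-positive queue weights.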
func validateWeightOfQueue(value int32, fldPath *field.Path) field.ErrorList {
errs := field.ErrorList{}
if value > 0 {
return errs
}
return append(errs, field.Invalid(fldPath, value, "queue weight must be a positive integer"))
}
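// validateQueueDeleting forbids deleting the "default" queue and only allows
// deletion of queues that are in the Closed state.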
func validateQueueDeleting(queue string) error {
if queue == "default" {
return fmt.Errorf("`%s` queue can not be deleted", "default")
}
q, err := config.VolcanoClient.SchedulingV1beta1().Queues().Get(context.TODO(), queue, metav1.GetOptions{})
if err != nil {
return err
}
if q.Status.State != schedulingv1beta1.QueueStateClosed {
return fmt.Errorf("only queue with state `%s` can be deleted, queue `%s` state is `%s`",
schedulingv1beta1.QueueStateClosed, q.Name, q.Status.State)
}
return nil
}