|
|
|
@@ -22,7 +22,7 @@ const ( |
|
|
|
storageTypeOBS = "obs" |
|
|
|
autoStopDuration = 4 * 60 * 60 |
|
|
|
autoStopDurationMs = 4 * 60 * 60 * 1000 |
|
|
|
MORDELART_USER_IMAGE_ENGINE_ID = 1000000 |
|
|
|
MORDELART_USER_IMAGE_ENGINE_ID = -1 |
|
|
|
DataSetMountPath = "/home/ma-user/work" |
|
|
|
NotebookEnv = "Python3" |
|
|
|
NotebookType = "Ascend" |
|
|
|
@@ -315,106 +315,58 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc |
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func GenerateTrainJobByUserImage(ctx *context.Context, req *GenerateTrainJobReq) (err error) { |
|
|
|
createTime := timeutil.TimeStampNow() |
|
|
|
jobResult, err := createTrainJobUserImage(models.CreateUserImageTrainJobParams{ |
|
|
|
JobName: req.JobName, |
|
|
|
Description: req.Description, |
|
|
|
Config: models.UserImageConfig{ |
|
|
|
WorkServerNum: req.WorkServerNumber, |
|
|
|
AppUrl: req.CodeObsPath, |
|
|
|
BootFileUrl: req.BootFileUrl, |
|
|
|
DataUrl: req.DataUrl, |
|
|
|
TrainUrl: req.TrainUrl, |
|
|
|
LogUrl: req.LogUrl, |
|
|
|
PoolID: req.PoolID, |
|
|
|
CreateVersion: true, |
|
|
|
Flavor: models.Flavor{ |
|
|
|
Code: req.FlavorCode, |
|
|
|
}, |
|
|
|
Parameter: req.Parameters, |
|
|
|
UserImageUrl: req.UserImageUrl, |
|
|
|
UserCommand: req.UserCommand, |
|
|
|
}, |
|
|
|
}) |
|
|
|
if err != nil { |
|
|
|
log.Error("CreateJob failed: %v", err.Error()) |
|
|
|
return err |
|
|
|
} |
|
|
|
|
|
|
|
jobId := strconv.FormatInt(jobResult.JobID, 10) |
|
|
|
err = models.CreateCloudbrain(&models.Cloudbrain{ |
|
|
|
Status: TransTrainJobStatus(jobResult.Status), |
|
|
|
UserID: ctx.User.ID, |
|
|
|
RepoID: ctx.Repo.Repository.ID, |
|
|
|
JobID: jobId, |
|
|
|
JobName: req.JobName, |
|
|
|
DisplayJobName: req.DisplayJobName, |
|
|
|
JobType: string(models.JobTypeTrain), |
|
|
|
Type: models.TypeCloudBrainTwo, |
|
|
|
VersionID: jobResult.VersionID, |
|
|
|
VersionName: jobResult.VersionName, |
|
|
|
Uuid: req.Uuid, |
|
|
|
DatasetName: req.DatasetName, |
|
|
|
CommitID: req.CommitID, |
|
|
|
IsLatestVersion: req.IsLatestVersion, |
|
|
|
ComputeResource: models.NPUResource, |
|
|
|
EngineID: MORDELART_USER_IMAGE_ENGINE_ID, |
|
|
|
Image: req.UserImageUrl, |
|
|
|
UserImageCommand: req.UserCommand, |
|
|
|
TrainUrl: req.TrainUrl, |
|
|
|
BranchName: req.BranchName, |
|
|
|
Parameters: req.Params, |
|
|
|
BootFile: req.BootFile, |
|
|
|
DataUrl: req.DataUrl, |
|
|
|
LogUrl: req.LogUrl, |
|
|
|
FlavorCode: req.FlavorCode, |
|
|
|
Description: req.Description, |
|
|
|
WorkServerNumber: req.WorkServerNumber, |
|
|
|
FlavorName: req.FlavorName, |
|
|
|
EngineName: req.EngineName, |
|
|
|
VersionCount: req.VersionCount, |
|
|
|
TotalVersionCount: req.TotalVersionCount, |
|
|
|
CreatedUnix: createTime, |
|
|
|
UpdatedUnix: createTime, |
|
|
|
}) |
|
|
|
|
|
|
|
if err != nil { |
|
|
|
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) |
|
|
|
return err |
|
|
|
} |
|
|
|
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask) |
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { |
|
|
|
createTime := timeutil.TimeStampNow() |
|
|
|
jobResult, err := createTrainJob(models.CreateTrainJobParams{ |
|
|
|
JobName: req.JobName, |
|
|
|
Description: req.Description, |
|
|
|
Config: models.Config{ |
|
|
|
WorkServerNum: req.WorkServerNumber, |
|
|
|
AppUrl: req.CodeObsPath, |
|
|
|
BootFileUrl: req.BootFileUrl, |
|
|
|
DataUrl: req.DataUrl, |
|
|
|
EngineID: req.EngineID, |
|
|
|
TrainUrl: req.TrainUrl, |
|
|
|
LogUrl: req.LogUrl, |
|
|
|
PoolID: req.PoolID, |
|
|
|
CreateVersion: true, |
|
|
|
Flavor: models.Flavor{ |
|
|
|
Code: req.FlavorCode, |
|
|
|
var jobResult *models.CreateTrainJobResult |
|
|
|
var createErr error |
|
|
|
if req.EngineID < 0 { |
|
|
|
jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{ |
|
|
|
JobName: req.JobName, |
|
|
|
Description: req.Description, |
|
|
|
Config: models.UserImageConfig{ |
|
|
|
WorkServerNum: req.WorkServerNumber, |
|
|
|
AppUrl: req.CodeObsPath, |
|
|
|
BootFileUrl: req.BootFileUrl, |
|
|
|
DataUrl: req.DataUrl, |
|
|
|
TrainUrl: req.TrainUrl, |
|
|
|
LogUrl: req.LogUrl, |
|
|
|
PoolID: req.PoolID, |
|
|
|
CreateVersion: true, |
|
|
|
Flavor: models.Flavor{ |
|
|
|
Code: req.FlavorCode, |
|
|
|
}, |
|
|
|
Parameter: req.Parameters, |
|
|
|
UserImageUrl: req.UserImageUrl, |
|
|
|
UserCommand: req.UserCommand, |
|
|
|
}, |
|
|
|
Parameter: req.Parameters, |
|
|
|
}, |
|
|
|
}) |
|
|
|
if err != nil { |
|
|
|
log.Error("CreateJob failed: %v", err.Error()) |
|
|
|
return err |
|
|
|
}) |
|
|
|
} else { |
|
|
|
jobResult, createErr = createTrainJob(models.CreateTrainJobParams{ |
|
|
|
JobName: req.JobName, |
|
|
|
Description: req.Description, |
|
|
|
Config: models.Config{ |
|
|
|
WorkServerNum: req.WorkServerNumber, |
|
|
|
AppUrl: req.CodeObsPath, |
|
|
|
BootFileUrl: req.BootFileUrl, |
|
|
|
DataUrl: req.DataUrl, |
|
|
|
EngineID: req.EngineID, |
|
|
|
TrainUrl: req.TrainUrl, |
|
|
|
LogUrl: req.LogUrl, |
|
|
|
PoolID: req.PoolID, |
|
|
|
CreateVersion: true, |
|
|
|
Flavor: models.Flavor{ |
|
|
|
Code: req.FlavorCode, |
|
|
|
}, |
|
|
|
Parameter: req.Parameters, |
|
|
|
}, |
|
|
|
}) |
|
|
|
} |
|
|
|
if createErr != nil { |
|
|
|
log.Error("CreateJob failed: %v", createErr.Error()) |
|
|
|
return createErr |
|
|
|
} |
|
|
|
|
|
|
|
jobId := strconv.FormatInt(jobResult.JobID, 10) |
|
|
|
err = models.CreateCloudbrain(&models.Cloudbrain{ |
|
|
|
createErr = models.CreateCloudbrain(&models.Cloudbrain{ |
|
|
|
Status: TransTrainJobStatus(jobResult.Status), |
|
|
|
UserID: ctx.User.ID, |
|
|
|
RepoID: ctx.Repo.Repository.ID, |
|
|
|
@@ -448,9 +400,9 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error |
|
|
|
UpdatedUnix: createTime, |
|
|
|
}) |
|
|
|
|
|
|
|
if err != nil { |
|
|
|
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) |
|
|
|
return err |
|
|
|
if createErr != nil { |
|
|
|
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error()) |
|
|
|
return createErr |
|
|
|
} |
|
|
|
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask) |
|
|
|
return nil |
|
|
|
@@ -482,45 +434,69 @@ func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrain |
|
|
|
|
|
|
|
func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) { |
|
|
|
createTime := timeutil.TimeStampNow() |
|
|
|
jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{ |
|
|
|
Description: req.Description, |
|
|
|
Config: models.TrainJobVersionConfig{ |
|
|
|
WorkServerNum: req.WorkServerNumber, |
|
|
|
AppUrl: req.CodeObsPath, |
|
|
|
BootFileUrl: req.BootFileUrl, |
|
|
|
DataUrl: req.DataUrl, |
|
|
|
EngineID: req.EngineID, |
|
|
|
TrainUrl: req.TrainUrl, |
|
|
|
LogUrl: req.LogUrl, |
|
|
|
PoolID: req.PoolID, |
|
|
|
Flavor: models.Flavor{ |
|
|
|
Code: req.FlavorCode, |
|
|
|
var jobResult *models.CreateTrainJobResult |
|
|
|
var createErr error |
|
|
|
if req.EngineID < 0 { |
|
|
|
jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{ |
|
|
|
Description: req.Description, |
|
|
|
Config: models.TrainJobVersionConfig{ |
|
|
|
WorkServerNum: req.WorkServerNumber, |
|
|
|
AppUrl: req.CodeObsPath, |
|
|
|
BootFileUrl: req.BootFileUrl, |
|
|
|
DataUrl: req.DataUrl, |
|
|
|
TrainUrl: req.TrainUrl, |
|
|
|
LogUrl: req.LogUrl, |
|
|
|
PoolID: req.PoolID, |
|
|
|
Flavor: models.Flavor{ |
|
|
|
Code: req.FlavorCode, |
|
|
|
}, |
|
|
|
Parameter: req.Parameters, |
|
|
|
PreVersionId: req.PreVersionId, |
|
|
|
UserImageUrl: req.UserImageUrl, |
|
|
|
UserCommand: req.UserCommand, |
|
|
|
}, |
|
|
|
Parameter: req.Parameters, |
|
|
|
PreVersionId: req.PreVersionId, |
|
|
|
}, |
|
|
|
}, jobId) |
|
|
|
if err != nil { |
|
|
|
log.Error("CreateJob failed: %v", err.Error()) |
|
|
|
return err |
|
|
|
}, jobId) |
|
|
|
} else { |
|
|
|
jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{ |
|
|
|
Description: req.Description, |
|
|
|
Config: models.TrainJobVersionConfig{ |
|
|
|
WorkServerNum: req.WorkServerNumber, |
|
|
|
AppUrl: req.CodeObsPath, |
|
|
|
BootFileUrl: req.BootFileUrl, |
|
|
|
DataUrl: req.DataUrl, |
|
|
|
EngineID: req.EngineID, |
|
|
|
TrainUrl: req.TrainUrl, |
|
|
|
LogUrl: req.LogUrl, |
|
|
|
PoolID: req.PoolID, |
|
|
|
Flavor: models.Flavor{ |
|
|
|
Code: req.FlavorCode, |
|
|
|
}, |
|
|
|
Parameter: req.Parameters, |
|
|
|
PreVersionId: req.PreVersionId, |
|
|
|
}, |
|
|
|
}, jobId) |
|
|
|
} |
|
|
|
if createErr != nil { |
|
|
|
log.Error("CreateJob failed: %v", createErr.Error()) |
|
|
|
return createErr |
|
|
|
} |
|
|
|
|
|
|
|
var jobTypes []string |
|
|
|
jobTypes = append(jobTypes, string(models.JobTypeTrain)) |
|
|
|
repo := ctx.Repo.Repository |
|
|
|
VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{ |
|
|
|
VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{ |
|
|
|
RepoID: repo.ID, |
|
|
|
Type: models.TypeCloudBrainTwo, |
|
|
|
JobTypes: jobTypes, |
|
|
|
JobID: strconv.FormatInt(jobResult.JobID, 10), |
|
|
|
}) |
|
|
|
if err != nil { |
|
|
|
ctx.ServerError("Cloudbrain", err) |
|
|
|
return err |
|
|
|
if createErr != nil { |
|
|
|
ctx.ServerError("Cloudbrain", createErr) |
|
|
|
return createErr |
|
|
|
} |
|
|
|
//将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount |
|
|
|
|
|
|
|
err = models.CreateCloudbrain(&models.Cloudbrain{ |
|
|
|
createErr = models.CreateCloudbrain(&models.Cloudbrain{ |
|
|
|
Status: TransTrainJobStatus(jobResult.Status), |
|
|
|
UserID: ctx.User.ID, |
|
|
|
RepoID: ctx.Repo.Repository.ID, |
|
|
|
@@ -555,19 +531,19 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job |
|
|
|
CreatedUnix: createTime, |
|
|
|
UpdatedUnix: createTime, |
|
|
|
}) |
|
|
|
if err != nil { |
|
|
|
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) |
|
|
|
return err |
|
|
|
if createErr != nil { |
|
|
|
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error()) |
|
|
|
return createErr |
|
|
|
} |
|
|
|
|
|
|
|
//将训练任务的上一版本的isLatestVersion设置为"0" |
|
|
|
err = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount) |
|
|
|
if err != nil { |
|
|
|
ctx.ServerError("Update IsLatestVersion failed", err) |
|
|
|
return err |
|
|
|
createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount) |
|
|
|
if createErr != nil { |
|
|
|
ctx.ServerError("Update IsLatestVersion failed", createErr) |
|
|
|
return createErr |
|
|
|
} |
|
|
|
|
|
|
|
return err |
|
|
|
return createErr |
|
|
|
} |
|
|
|
|
|
|
|
func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) { |
|
|
|
|