| @@ -403,6 +403,8 @@ type BenchmarkDataset struct { | |||||
| Id int `json:"id"` | Id int `json:"id"` | ||||
| Value string `json:"value"` //二级算法类型名称 | Value string `json:"value"` //二级算法类型名称 | ||||
| Attachment string `json:"attachment"` //数据集的uuid | Attachment string `json:"attachment"` //数据集的uuid | ||||
| Owner string `json:"owner"` //评估脚本所在仓库的拥有者 | |||||
| RepoName string `json:"repo_name"` //评估脚本所在仓库的名称 | |||||
| } | } | ||||
| type GpuInfos struct { | type GpuInfos struct { | ||||
| @@ -476,7 +478,7 @@ type MatchInfo struct { | |||||
| type GetJobLogResult struct { | type GetJobLogResult struct { | ||||
| ScrollID string `json:"_scroll_id"` | ScrollID string `json:"_scroll_id"` | ||||
| Took int `json:"took"` | |||||
| Took int `json:"took"` | |||||
| TimedOut bool `json:"timed_out"` | TimedOut bool `json:"timed_out"` | ||||
| Shards struct { | Shards struct { | ||||
| Total int `json:"total"` | Total int `json:"total"` | ||||
| @@ -485,18 +487,34 @@ type GetJobLogResult struct { | |||||
| Failed int `json:"failed"` | Failed int `json:"failed"` | ||||
| } `json:"_shards"` | } `json:"_shards"` | ||||
| Hits struct { | Hits struct { | ||||
| Hits []struct { | |||||
| Index string `json:"_index"` | |||||
| Type string `json:"_type"` | |||||
| ID string `json:"_id"` | |||||
| Source struct { | |||||
| Message string `json:"message"` | |||||
| } `json:"_source"` | |||||
| Sort []int `json:"sort"` | |||||
| } `json:"hits"` | |||||
| Hits []Hits `json:"hits"` | |||||
| } `json:"hits"` | } `json:"hits"` | ||||
| } | } | ||||
| type Hits struct { | |||||
| Index string `json:"_index"` | |||||
| Type string `json:"_type"` | |||||
| ID string `json:"_id"` | |||||
| Source struct { | |||||
| Message string `json:"message"` | |||||
| } `json:"_source"` | |||||
| Sort []int `json:"sort"` | |||||
| } | |||||
| type GetAllJobLogParams struct { | |||||
| Scroll string `json:"scroll"` | |||||
| ScrollID string `json:"scroll_id"` | |||||
| } | |||||
| type DeleteJobLogTokenParams struct { | |||||
| ScrollID string `json:"scroll_id"` | |||||
| } | |||||
| type DeleteJobLogTokenResult struct { | |||||
| Succeeded bool `json:"succeeded"` | |||||
| NumFreed int `json:"num_freed"` | |||||
| } | |||||
| type CloudBrainResult struct { | type CloudBrainResult struct { | ||||
| Code string `json:"code"` | Code string `json:"code"` | ||||
| Msg string `json:"msg"` | Msg string `json:"msg"` | ||||
| @@ -17,7 +17,8 @@ const ( | |||||
| Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple; | Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple; | ||||
| service ssh stop; | service ssh stop; | ||||
| jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` | jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` | ||||
| CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` | |||||
| //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` | |||||
| CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` | |||||
| CodeMountPath = "/code" | CodeMountPath = "/code" | ||||
| DataSetMountPath = "/dataset" | DataSetMountPath = "/dataset" | ||||
| ModelMountPath = "/model" | ModelMountPath = "/model" | ||||
| @@ -26,6 +26,8 @@ const ( | |||||
| JobHasBeenStopped = "S410" | JobHasBeenStopped = "S410" | ||||
| Public = "public" | Public = "public" | ||||
| Custom = "custom" | Custom = "custom" | ||||
| LogPageSize = 500 | |||||
| LogPageTokenExpired = "5m" | |||||
| ) | ) | ||||
| func getRestyClient() *resty.Client { | func getRestyClient() *resty.Client { | ||||
| @@ -279,7 +281,7 @@ func GetJobLog(jobID string) (*models.GetJobLogResult, error) { | |||||
| client := getRestyClient() | client := getRestyClient() | ||||
| var result models.GetJobLogResult | var result models.GetJobLogResult | ||||
| req := models.GetJobLogParams{ | req := models.GetJobLogParams{ | ||||
| Size: "5000", | |||||
| Size: strconv.Itoa(LogPageSize), | |||||
| Sort: "log.offset", | Sort: "log.offset", | ||||
| QueryInfo: models.QueryInfo{ | QueryInfo: models.QueryInfo{ | ||||
| MatchInfo: models.MatchInfo{ | MatchInfo: models.MatchInfo{ | ||||
| @@ -293,17 +295,79 @@ func GetJobLog(jobID string) (*models.GetJobLogResult, error) { | |||||
| SetAuthToken(TOKEN). | SetAuthToken(TOKEN). | ||||
| SetBody(req). | SetBody(req). | ||||
| SetResult(&result). | SetResult(&result). | ||||
| Post(HOST + "es/_search?_source=message&scroll=5m") | |||||
| Post(HOST + "es/_search?_source=message&scroll=" + LogPageTokenExpired) | |||||
| if err != nil { | if err != nil { | ||||
| log.Info("GetJobLog failed: %v", err) | |||||
| log.Error("GetJobLog failed: %v", err) | |||||
| return &result, fmt.Errorf("resty GetJobLog: %v, %s", err, res.String()) | return &result, fmt.Errorf("resty GetJobLog: %v, %s", err, res.String()) | ||||
| } | } | ||||
| if !strings.Contains(res.Status(), strconv.Itoa(http.StatusOK)) { | if !strings.Contains(res.Status(), strconv.Itoa(http.StatusOK)) { | ||||
| log.Info("res.Status(): %s, response: %s", res.Status(), res.String()) | |||||
| log.Error("res.Status(): %s, response: %s", res.Status(), res.String()) | |||||
| return &result, errors.New(res.String()) | return &result, errors.New(res.String()) | ||||
| } | } | ||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func GetJobAllLog(scrollID string) (*models.GetJobLogResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.GetJobLogResult | |||||
| req := models.GetAllJobLogParams{ | |||||
| Scroll: LogPageTokenExpired, | |||||
| ScrollID: scrollID, | |||||
| } | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetBody(req). | |||||
| SetResult(&result). | |||||
| Post(HOST + "es/_search/scroll") | |||||
| if err != nil { | |||||
| log.Error("GetJobAllLog failed: %v", err) | |||||
| return &result, fmt.Errorf("resty GetJobAllLog: %v, %s", err, res.String()) | |||||
| } | |||||
| if !strings.Contains(res.Status(), strconv.Itoa(http.StatusOK)) { | |||||
| log.Error("res.Status(): %s, response: %s", res.Status(), res.String()) | |||||
| return &result, errors.New(res.String()) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| func DeleteJobLogToken(scrollID string) (error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.DeleteJobLogTokenResult | |||||
| req := models.DeleteJobLogTokenParams{ | |||||
| ScrollID: scrollID, | |||||
| } | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetBody(req). | |||||
| SetResult(&result). | |||||
| Delete(HOST + "es/_search/scroll") | |||||
| if err != nil { | |||||
| log.Error("DeleteJobLogToken failed: %v", err) | |||||
| return fmt.Errorf("resty DeleteJobLogToken: %v, %s", err, res.String()) | |||||
| } | |||||
| if !strings.Contains(res.Status(), strconv.Itoa(http.StatusOK)) { | |||||
| log.Error("res.Status(): %s, response: %s", res.Status(), res.String()) | |||||
| return errors.New(res.String()) | |||||
| } | |||||
| if !result.Succeeded { | |||||
| log.Error("DeleteJobLogToken failed") | |||||
| return errors.New("DeleteJobLogToken failed") | |||||
| } | |||||
| return nil | |||||
| } | |||||
| @@ -102,25 +102,45 @@ func CloudbrainGetLog(ctx *context.Context) { | |||||
| return | return | ||||
| } | } | ||||
| var hits []models.Hits | |||||
| result, err := cloudbrain.GetJobLog(jobID) | result, err := cloudbrain.GetJobLog(jobID) | ||||
| if err != nil{ | if err != nil{ | ||||
| log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) | log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) | ||||
| ctx.ServerError(err.Error(), err) | ctx.ServerError(err.Error(), err) | ||||
| return | return | ||||
| } | } | ||||
| hits = result.Hits.Hits | |||||
| //if the size equal page_size, then take the scroll_id to get all log and delete the scroll_id(the num of scroll_id is limited) | |||||
| if len(result.Hits.Hits) >= cloudbrain.LogPageSize { | |||||
| for { | |||||
| resultNext, err := cloudbrain.GetJobAllLog(result.ScrollID) | |||||
| if err != nil{ | |||||
| log.Error("GetJobAllLog failed: %v", err, ctx.Data["MsgID"]) | |||||
| } else { | |||||
| for _, hit := range resultNext.Hits.Hits { | |||||
| hits = append(hits, hit) | |||||
| } | |||||
| } | |||||
| if len(resultNext.Hits.Hits) < cloudbrain.LogPageSize { | |||||
| log.Info("get all log already") | |||||
| break | |||||
| } | |||||
| } | |||||
| } | |||||
| sort.Slice(result.Hits.Hits, func(i, j int) bool { | |||||
| return result.Hits.Hits[i].Sort[0] < result.Hits.Hits[j].Sort[0] | |||||
| cloudbrain.DeleteJobLogToken(result.ScrollID) | |||||
| sort.Slice(hits, func(i, j int) bool { | |||||
| return hits[i].Sort[0] < hits[j].Sort[0] | |||||
| }) | }) | ||||
| log.Info("%v", result.Hits.Hits) | |||||
| var content []string | |||||
| for _, log := range result.Hits.Hits { | |||||
| content = append(content, log.Source.Message + "\n") | |||||
| var content string | |||||
| for _, log := range hits { | |||||
| content += log.Source.Message + "\n" | |||||
| } | } | ||||
| log.Info("%v", content) | |||||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | ctx.JSON(http.StatusOK, map[string]interface{}{ | ||||
| "JobID": jobID, | "JobID": jobID, | ||||
| "Content": content, | "Content": content, | ||||
| @@ -1062,12 +1062,12 @@ func CloudBrainBenchmarkNew(ctx *context.Context) { | |||||
| ctx.HTML(200, tplCloudBrainBenchmarkNew) | ctx.HTML(200, tplCloudBrainBenchmarkNew) | ||||
| } | } | ||||
| func getBenchmarkAttachment(benchmarkTypeID, benchmarkChildTypeID int) (string, error) { | |||||
| uuid := "" | |||||
| func getBenchmarkAttachment(benchmarkTypeID, benchmarkChildTypeID int) (*models.BenchmarkDataset, error) { | |||||
| var childInfo *models.BenchmarkDataset | |||||
| if benchmarkTypes == nil { | if benchmarkTypes == nil { | ||||
| if err := json.Unmarshal([]byte(setting.BenchmarkTypes), &benchmarkTypes); err != nil { | if err := json.Unmarshal([]byte(setting.BenchmarkTypes), &benchmarkTypes); err != nil { | ||||
| log.Error("json.Unmarshal BenchmarkTypes(%s) failed:%v", setting.BenchmarkTypes, err) | log.Error("json.Unmarshal BenchmarkTypes(%s) failed:%v", setting.BenchmarkTypes, err) | ||||
| return uuid, err | |||||
| return childInfo, err | |||||
| } | } | ||||
| } | } | ||||
| @@ -1076,7 +1076,7 @@ func getBenchmarkAttachment(benchmarkTypeID, benchmarkChildTypeID int) (string, | |||||
| if benchmarkType.Id == benchmarkTypeID { | if benchmarkType.Id == benchmarkTypeID { | ||||
| for _, childType := range benchmarkType.Second { | for _, childType := range benchmarkType.Second { | ||||
| if childType.Id == benchmarkChildTypeID { | if childType.Id == benchmarkChildTypeID { | ||||
| uuid = childType.Attachment | |||||
| childInfo = childType | |||||
| isExist = true | isExist = true | ||||
| break | break | ||||
| } | } | ||||
| @@ -1087,10 +1087,10 @@ func getBenchmarkAttachment(benchmarkTypeID, benchmarkChildTypeID int) (string, | |||||
| if !isExist { | if !isExist { | ||||
| log.Error("no such benchmark_type_id&benchmark_child_type_id") | log.Error("no such benchmark_type_id&benchmark_child_type_id") | ||||
| return uuid, errors.New("no such benchmark_type_id&benchmark_child_type_id") | |||||
| return childInfo, errors.New("no such benchmark_type_id&benchmark_child_type_id") | |||||
| } | } | ||||
| return uuid, nil | |||||
| return childInfo, nil | |||||
| } | } | ||||
| func getBenchmarkGpuQueue(gpuQueue string) (string, error) { | func getBenchmarkGpuQueue(gpuQueue string) (string, error) { | ||||
| @@ -1161,7 +1161,7 @@ func CloudBrainBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainF | |||||
| return | return | ||||
| } | } | ||||
| uuid, err := getBenchmarkAttachment(benchmarkTypeID, benchmarkChildTypeID) | |||||
| childInfo, err := getBenchmarkAttachment(benchmarkTypeID, benchmarkChildTypeID) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("getBenchmarkAttachment failed:%v", err, ctx.Data["MsgID"]) | log.Error("getBenchmarkAttachment failed:%v", err, ctx.Data["MsgID"]) | ||||
| cloudBrainNewDataPrepare(ctx) | cloudBrainNewDataPrepare(ctx) | ||||
| @@ -1240,7 +1240,7 @@ func CloudBrainBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainF | |||||
| } | } | ||||
| } | } | ||||
| if err := downloadRateCode(repo, jobName, setting.BenchmarkOwner, setting.BenchmarkName, benchmarkPath, form.BenchmarkCategory, gpuType); err != nil { | |||||
| if err := downloadRateCode(repo, jobName, childInfo.Owner, childInfo.RepoName, benchmarkPath, form.BenchmarkCategory, gpuType); err != nil { | |||||
| log.Error("downloadRateCode failed, %v", err, ctx.Data["MsgID"]) | log.Error("downloadRateCode failed, %v", err, ctx.Data["MsgID"]) | ||||
| //cloudBrainNewDataPrepare(ctx) | //cloudBrainNewDataPrepare(ctx) | ||||
| //ctx.RenderWithErr("system error", tplCloudBrainBenchmarkNew, &form) | //ctx.RenderWithErr("system error", tplCloudBrainBenchmarkNew, &form) | ||||
| @@ -1254,7 +1254,7 @@ func CloudBrainBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainF | |||||
| //return | //return | ||||
| } | } | ||||
| err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"), | |||||
| err = cloudbrain.GenerateTask(ctx, jobName, image, command, childInfo.Attachment, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"), | |||||
| storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), | storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), | ||||
| storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), | storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), | ||||
| storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), string(models.JobTypeBenchmark), gpuQueue, form.Description, | storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), string(models.JobTypeBenchmark), gpuQueue, form.Description, | ||||