You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

aisafety.go 28 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "io"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "strconv"
  11. "strings"
  12. "time"
  13. "code.gitea.io/gitea/models"
  14. "code.gitea.io/gitea/modules/aisafety"
  15. "code.gitea.io/gitea/modules/cloudbrain"
  16. "code.gitea.io/gitea/modules/context"
  17. "code.gitea.io/gitea/modules/git"
  18. "code.gitea.io/gitea/modules/log"
  19. "code.gitea.io/gitea/modules/modelarts"
  20. "code.gitea.io/gitea/modules/setting"
  21. "code.gitea.io/gitea/modules/storage"
  22. "code.gitea.io/gitea/modules/util"
  23. "code.gitea.io/gitea/services/cloudbrain/resource"
  24. "code.gitea.io/gitea/services/reward/point/account"
  25. uuid "github.com/satori/go.uuid"
  26. )
  27. const (
  28. tplModelSafetyTestCreateGpu = "repo/modelsafety/newgpu"
  29. tplModelSafetyTestCreateNpu = "repo/modelsafety/newnpu"
  30. )
  31. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  32. log.Info("start to create CloudBrainAiSafetyCreate")
  33. uuid := uuid.NewV4()
  34. id := uuid.String()
  35. seriaNoParas := ctx.Query("serialNo")
  36. fileName := ctx.Query("fileName")
  37. //if jobType == string(models.JobTypeBenchmark) {
  38. req := aisafety.TaskReq{
  39. UnionId: id,
  40. EvalName: "test1",
  41. EvalContent: "test1",
  42. TLPath: "test1",
  43. Indicators: []string{"ACC", "ASS"},
  44. CDName: "CIFAR10_1000_FGSM",
  45. BDName: "CIFAR10_1000基础数据集",
  46. }
  47. aisafety.GetAlgorithmList()
  48. if seriaNoParas != "" {
  49. aisafety.GetTaskStatus(seriaNoParas)
  50. } else {
  51. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  52. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  53. if err == nil {
  54. log.Info("serialNo=" + serialNo)
  55. time.Sleep(time.Duration(2) * time.Second)
  56. aisafety.GetTaskStatus(serialNo)
  57. } else {
  58. log.Info("CreateSafetyTask error," + err.Error())
  59. }
  60. }
  61. }
  62. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  63. if job == nil {
  64. log.Error("GetCloudbrainByJobID failed")
  65. return
  66. }
  67. syncAiSafetyTaskStatus(job)
  68. }
  69. func GetAiSafetyTask(ctx *context.Context) {
  70. var ID = ctx.Params(":jobid")
  71. job, err := models.GetCloudbrainByJobIDWithDeleted(ID)
  72. if err != nil {
  73. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  74. return
  75. }
  76. syncAiSafetyTaskStatus(job)
  77. job, err = models.GetCloudbrainByJobIDWithDeleted(ID)
  78. ctx.JSON(200, job)
  79. }
  80. func StopAiSafetyTask(ctx *context.Context) {
  81. }
  82. func DelAiSafetyTask(ctx *context.Context) {
  83. var ID = ctx.Params(":jobid")
  84. task, err := models.GetCloudbrainByJobIDWithDeleted(ID)
  85. result := make(map[string]interface{})
  86. result["code"] = 1
  87. if err != nil {
  88. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  89. result["msg"] = "No such task."
  90. ctx.JSON(200, result)
  91. return
  92. }
  93. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  94. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  95. result["msg"] = "the job(" + task.JobName + ") has not been stopped"
  96. ctx.JSON(200, result)
  97. return
  98. }
  99. err = models.DeleteJob(task)
  100. if err != nil {
  101. result["msg"] = err.Error()
  102. ctx.JSON(200, result)
  103. return
  104. }
  105. result["code"] = 0
  106. result["msg"] = "Succeed"
  107. ctx.JSON(200, result)
  108. }
  109. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  110. if isTaskNotFinished(job.Status) {
  111. if job.Type == models.TypeCloudBrainTwo {
  112. queryTaskStatusFromCloudbrainTwo(job)
  113. } else if job.Type == models.TypeCloudBrainOne {
  114. queryTaskStatusFromCloudbrain(job)
  115. }
  116. } else {
  117. if job.Status == string(models.ModelSafetyTesting) {
  118. queryTaskStatusFromModelSafetyTestServer(job)
  119. } else {
  120. log.Info("The job is finished. status=" + job.Status)
  121. }
  122. }
  123. }
  124. func TimerHandleModelSafetyTestTask() {
  125. tasks, err := models.GetModelSafetyTestTask()
  126. if err == nil {
  127. if tasks != nil && len(tasks) > 0 {
  128. for _, job := range tasks {
  129. syncAiSafetyTaskStatus(job)
  130. }
  131. } else {
  132. log.Info("query running model safety test task 0.")
  133. }
  134. } else {
  135. log.Info("query running model safety test task err." + err.Error())
  136. }
  137. }
  138. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  139. log.Info("The task not finished,name=" + job.DisplayJobName)
  140. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  141. if err != nil {
  142. log.Info("query train job error." + err.Error())
  143. return
  144. }
  145. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  146. job.Duration = result.Duration
  147. job.TrainJobDuration = result.TrainJobDuration
  148. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  149. err = models.UpdateJob(job)
  150. if err != nil {
  151. log.Error("UpdateJob failed:", err)
  152. }
  153. } else {
  154. job.Status = string(models.ModelSafetyTesting)
  155. err = models.UpdateJob(job)
  156. if err != nil {
  157. log.Error("UpdateJob failed:", err)
  158. }
  159. //send msg to beihang
  160. sendNPUInferenceResultToTest(job)
  161. }
  162. }
  163. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  164. datasetname := job.DatasetName
  165. datasetnames := strings.Split(datasetname, ";")
  166. indicator := job.LabelName
  167. req := aisafety.TaskReq{
  168. UnionId: job.JobID,
  169. EvalName: job.DisplayJobName,
  170. EvalContent: job.Description,
  171. TLPath: "test",
  172. Indicators: strings.Split(indicator, ";"),
  173. CDName: datasetnames[1],
  174. BDName: datasetnames[0],
  175. }
  176. jsonContent := ""
  177. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  178. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  179. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  180. if err != nil {
  181. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  182. } else {
  183. defer body.Close()
  184. var data []byte
  185. p := make([]byte, 4096)
  186. var readErr error
  187. var readCount int
  188. for {
  189. readCount, readErr = body.Read(p)
  190. if readCount > 0 {
  191. data = append(data, p[:readCount]...)
  192. }
  193. if readErr != nil || readCount == 0 {
  194. break
  195. }
  196. }
  197. jsonContent = string(data)
  198. }
  199. if jsonContent != "" {
  200. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  201. if err == nil {
  202. //update serial no to db
  203. job.PreVersionName = serialNo
  204. err = models.UpdateJob(job)
  205. if err != nil {
  206. log.Error("UpdateJob failed:", err)
  207. }
  208. }
  209. } else {
  210. log.Info("The json is null. so set it failed.")
  211. //update task failed.
  212. job.Status = string(models.ModelArtsTrainJobFailed)
  213. err := models.UpdateJob(job)
  214. if err != nil {
  215. log.Error("UpdateJob failed:", err)
  216. }
  217. }
  218. }
  219. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  220. log.Info("The task not finished,name=" + job.DisplayJobName)
  221. jobResult, err := cloudbrain.GetJob(job.JobID)
  222. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  223. if err != nil {
  224. log.Error("ConvertToJobResultPayload failed:", err)
  225. return
  226. }
  227. job.Status = result.JobStatus.State
  228. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  229. taskRoles := result.TaskRoles
  230. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  231. job.Status = taskRes.TaskStatuses[0].State
  232. }
  233. if result.JobStatus.State != string(models.JobSucceeded) {
  234. err = models.UpdateJob(job)
  235. if err != nil {
  236. log.Error("UpdateJob failed:", err)
  237. }
  238. } else {
  239. //
  240. job.Status = string(models.ModelSafetyTesting)
  241. err = models.UpdateJob(job)
  242. if err != nil {
  243. log.Error("UpdateJob failed:", err)
  244. }
  245. //send msg to beihang
  246. sendGPUInferenceResultToTest(job)
  247. }
  248. }
  249. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  250. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  251. if err == nil {
  252. if result.Code == "0" {
  253. if result.Data.Status == 1 {
  254. log.Info("The task is running....")
  255. } else {
  256. if result.Data.Code == 0 {
  257. job.ResultJson = result.Data.StandardJson
  258. err = models.UpdateJob(job)
  259. if err != nil {
  260. log.Error("UpdateJob failed:", err)
  261. }
  262. }
  263. }
  264. } else {
  265. log.Info("The task is failed.")
  266. job.Status = string(models.JobFailed)
  267. err = models.UpdateJob(job)
  268. if err != nil {
  269. log.Error("UpdateJob failed:", err)
  270. }
  271. }
  272. } else {
  273. log.Info("The task not found.....")
  274. }
  275. }
  276. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  277. datasetname := job.DatasetName
  278. datasetnames := strings.Split(datasetname, ";")
  279. indicator := job.LabelName
  280. req := aisafety.TaskReq{
  281. UnionId: job.JobID,
  282. EvalName: job.DisplayJobName,
  283. EvalContent: job.Description,
  284. TLPath: "test",
  285. Indicators: strings.Split(indicator, ";"),
  286. CDName: datasetnames[1],
  287. BDName: datasetnames[0],
  288. }
  289. resultDir := "/model"
  290. prefix := "/" + setting.CBCodePathPrefix + job.JobName + resultDir
  291. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  292. if err != nil {
  293. log.Error("query cloudbrain one model failed: %v", err)
  294. return
  295. }
  296. jsonContent := ""
  297. for _, file := range files {
  298. if strings.HasSuffix(file.FileName, "result.json") {
  299. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  300. log.Info("path=" + path)
  301. reader, err := os.Open(path)
  302. defer reader.Close()
  303. if err == nil {
  304. r := bufio.NewReader(reader)
  305. for {
  306. line, error := r.ReadString('\n')
  307. if error == io.EOF {
  308. log.Info("read file completed.")
  309. break
  310. }
  311. if error != nil {
  312. log.Info("read file error." + error.Error())
  313. break
  314. }
  315. jsonContent += line
  316. }
  317. }
  318. break
  319. }
  320. }
  321. if jsonContent != "" {
  322. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  323. if err == nil {
  324. //update serial no to db
  325. job.PreVersionName = serialNo
  326. err = models.UpdateJob(job)
  327. if err != nil {
  328. log.Error("UpdateJob failed:", err)
  329. }
  330. }
  331. } else {
  332. log.Info("The json is null. so set it failed.")
  333. //update task failed.
  334. job.Status = string(models.JobFailed)
  335. err = models.UpdateJob(job)
  336. if err != nil {
  337. log.Error("UpdateJob failed:", err)
  338. }
  339. }
  340. }
  341. func isTaskNotFinished(status string) bool {
  342. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  343. return true
  344. }
  345. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  346. return true
  347. }
  348. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  349. return true
  350. }
  351. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  352. return true
  353. }
  354. return false
  355. }
  356. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  357. t := time.Now()
  358. ctx.Data["PageIsCloudBrain"] = true
  359. ctx.Data["IsCreate"] = true
  360. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  361. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  362. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  363. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  364. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  365. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  366. ctx.Data["display_job_name"] = displayJobName
  367. prepareCloudbrainOneSpecs(ctx)
  368. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  369. if queuesDetail != nil {
  370. ctx.Data["QueuesDetail"] = queuesDetail
  371. }
  372. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  373. }
  374. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  375. t := time.Now()
  376. ctx.Data["PageIsCloudBrain"] = true
  377. ctx.Data["IsCreate"] = true
  378. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  379. ctx.Data["display_job_name"] = displayJobName
  380. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  381. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  382. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  383. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  384. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  385. var resourcePools modelarts.ResourcePool
  386. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  387. ctx.ServerError("json.Unmarshal failed:", err)
  388. }
  389. ctx.Data["resource_pools"] = resourcePools.Info
  390. var engines modelarts.Engine
  391. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  392. ctx.ServerError("json.Unmarshal failed:", err)
  393. }
  394. ctx.Data["engines"] = engines.Info
  395. var versionInfos modelarts.VersionInfo
  396. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  397. ctx.ServerError("json.Unmarshal failed:", err)
  398. }
  399. ctx.Data["engine_versions"] = versionInfos.Version
  400. prepareCloudbrainTwoInferenceSpecs(ctx)
  401. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  402. ctx.Data["WaitCount"] = waitCount
  403. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  404. }
  405. func AiSafetyCreateForPost(ctx *context.Context) {
  406. ctx.Data["PageIsCloudBrain"] = true
  407. displayJobName := ctx.Query("DisplayJobName")
  408. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  409. taskType := ctx.QueryInt("type")
  410. description := ctx.Query("Description")
  411. ctx.Data["description"] = description
  412. repo := ctx.Repo.Repository
  413. tpname := tplCloudBrainModelSafetyNewNpu
  414. if taskType == models.TypeCloudBrainOne {
  415. tpname = tplCloudBrainModelSafetyNewGpu
  416. }
  417. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  418. if err == nil {
  419. if len(tasks) != 0 {
  420. log.Error("the job name did already exist", ctx.Data["MsgID"])
  421. modelSafetyNewDataPrepare(ctx)
  422. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  423. return
  424. }
  425. } else {
  426. if !models.IsErrJobNotExist(err) {
  427. log.Error("system error, %v", err, ctx.Data["MsgID"])
  428. modelSafetyNewDataPrepare(ctx)
  429. ctx.RenderWithErr("system error", tpname, nil)
  430. return
  431. }
  432. }
  433. if !jobNamePattern.MatchString(jobName) {
  434. modelSafetyNewDataPrepare(ctx)
  435. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  436. return
  437. }
  438. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  439. if err != nil {
  440. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  441. modelSafetyNewDataPrepare(ctx)
  442. ctx.RenderWithErr("system error", tpname, nil)
  443. return
  444. } else {
  445. if count >= 1 {
  446. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  447. modelSafetyNewDataPrepare(ctx)
  448. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  449. return
  450. }
  451. }
  452. BootFile := ctx.Query("BootFile")
  453. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  454. if err != nil || !bootFileExist {
  455. log.Error("Get bootfile error:", err)
  456. modelSafetyNewDataPrepare(ctx)
  457. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  458. return
  459. }
  460. if taskType == models.TypeCloudBrainTwo {
  461. createForNPU(ctx, jobName)
  462. } else if taskType == models.TypeCloudBrainOne {
  463. createForGPU(ctx, jobName)
  464. }
  465. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  466. }
  467. func createForNPU(ctx *context.Context, jobName string) {
  468. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  469. BootFile := ctx.Query("BootFile")
  470. displayJobName := ctx.Query("DisplayJobName")
  471. description := ctx.Query("Description")
  472. engineID := ctx.QueryInt("EngineID")
  473. poolID := ctx.Query("PoolID")
  474. //image := strings.TrimSpace(ctx.Query("Image"))
  475. srcDataset := ctx.Query("srcDataset") //uuid
  476. combatDataset := ctx.Query("combatDataset") //uuid
  477. evaluationIndex := ctx.Query("evaluationIndex")
  478. Params := ctx.Query("RunParaList")
  479. specId := ctx.QueryInt64("SpecId")
  480. repo := ctx.Repo.Repository
  481. trainUrl := ctx.Query("TrainUrl")
  482. modelName := ctx.Query("ModelName")
  483. modelVersion := ctx.Query("ModelVersion")
  484. ckptName := ctx.Query("CkptName")
  485. ckptUrl := "/" + trainUrl + ckptName
  486. log.Info("ckpt url:" + ckptUrl)
  487. FlavorName := ctx.Query("FlavorName")
  488. EngineName := ctx.Query("EngineName")
  489. isLatestVersion := modelarts.IsLatestVersion
  490. VersionCount := modelarts.VersionCountOne
  491. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  492. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  493. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  494. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  495. log.Info("ckpt url:" + ckptUrl)
  496. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  497. JobType: models.JobTypeInference,
  498. ComputeResource: models.NPU,
  499. Cluster: models.OpenICluster,
  500. AiCenterCode: models.AICenterOfCloudBrainTwo})
  501. if err != nil || spec == nil {
  502. modelSafetyNewDataPrepare(ctx)
  503. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  504. return
  505. }
  506. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  507. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  508. modelSafetyNewDataPrepare(ctx)
  509. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewNpu, nil)
  510. return
  511. }
  512. //todo: del the codeLocalPath
  513. _, err = ioutil.ReadDir(codeLocalPath)
  514. if err == nil {
  515. os.RemoveAll(codeLocalPath)
  516. }
  517. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  518. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  519. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  520. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  521. modelSafetyNewDataPrepare(ctx)
  522. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  523. return
  524. }
  525. //todo: upload code (send to file_server todo this work?)
  526. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  527. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  528. modelSafetyNewDataPrepare(ctx)
  529. ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNewNpu, nil)
  530. return
  531. }
  532. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  533. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  534. modelSafetyNewDataPrepare(ctx)
  535. ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNewNpu, nil)
  536. return
  537. }
  538. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  539. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  540. modelSafetyNewDataPrepare(ctx)
  541. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  542. return
  543. }
  544. var parameters models.Parameters
  545. param := make([]models.Parameter, 0)
  546. param = append(param, models.Parameter{
  547. Label: modelarts.ResultUrl,
  548. Value: "s3:/" + resultObsPath,
  549. }, models.Parameter{
  550. Label: modelarts.CkptUrl,
  551. Value: "s3:/" + ckptUrl,
  552. })
  553. uuid := srcDataset + ";" + combatDataset
  554. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  555. if err != nil {
  556. modelSafetyNewDataPrepare(ctx)
  557. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  558. return
  559. }
  560. dataPath := dataUrl
  561. jsondatas, err := json.Marshal(datasUrlList)
  562. if err != nil {
  563. log.Error("Failed to Marshal: %v", err)
  564. modelSafetyNewDataPrepare(ctx)
  565. ctx.RenderWithErr("json error:"+err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  566. return
  567. }
  568. if isMultiDataset {
  569. param = append(param, models.Parameter{
  570. Label: modelarts.MultiDataUrl,
  571. Value: string(jsondatas),
  572. })
  573. }
  574. existDeviceTarget := false
  575. if len(Params) != 0 {
  576. err := json.Unmarshal([]byte(Params), &parameters)
  577. if err != nil {
  578. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  579. modelSafetyNewDataPrepare(ctx)
  580. ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNewNpu, nil)
  581. return
  582. }
  583. for _, parameter := range parameters.Parameter {
  584. if parameter.Label == modelarts.DeviceTarget {
  585. existDeviceTarget = true
  586. }
  587. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  588. param = append(param, models.Parameter{
  589. Label: parameter.Label,
  590. Value: parameter.Value,
  591. })
  592. }
  593. }
  594. }
  595. if !existDeviceTarget {
  596. param = append(param, models.Parameter{
  597. Label: modelarts.DeviceTarget,
  598. Value: modelarts.Ascend,
  599. })
  600. }
  601. req := &modelarts.GenerateInferenceJobReq{
  602. JobName: jobName,
  603. DisplayJobName: displayJobName,
  604. DataUrl: dataPath,
  605. Description: description,
  606. CodeObsPath: codeObsPath,
  607. BootFileUrl: codeObsPath + BootFile,
  608. BootFile: BootFile,
  609. TrainUrl: trainUrl,
  610. WorkServerNumber: 1,
  611. EngineID: int64(engineID),
  612. LogUrl: logObsPath,
  613. PoolID: poolID,
  614. Uuid: uuid,
  615. Parameters: param, //modelarts train parameters
  616. CommitID: commitID,
  617. BranchName: cloudbrain.DefaultBranchName,
  618. Params: Params,
  619. FlavorName: FlavorName,
  620. EngineName: EngineName,
  621. LabelName: evaluationIndex,
  622. IsLatestVersion: isLatestVersion,
  623. VersionCount: VersionCount,
  624. TotalVersionCount: modelarts.TotalVersionCount,
  625. ModelName: modelName,
  626. ModelVersion: modelVersion,
  627. CkptName: ckptName,
  628. ResultUrl: resultObsPath,
  629. Spec: spec,
  630. DatasetName: datasetNames,
  631. JobType: string(models.JobTypeModelSafety),
  632. }
  633. err = modelarts.GenerateInferenceJob(ctx, req)
  634. if err != nil {
  635. log.Error("GenerateTrainJob failed:%v", err.Error())
  636. modelSafetyNewDataPrepare(ctx)
  637. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  638. return
  639. }
  640. }
  641. func createForGPU(ctx *context.Context, jobName string) {
  642. BootFile := ctx.Query("boot_file")
  643. displayJobName := ctx.Query("display_job_name")
  644. description := ctx.Query("description")
  645. image := strings.TrimSpace(ctx.Query("image"))
  646. srcDataset := ctx.Query("src_dataset") //uuid
  647. combatDataset := ctx.Query("combat_dataset") //uuid
  648. evaluationIndex := ctx.Query("evaluationIndex")
  649. Params := ctx.Query("run_para_list")
  650. specId := ctx.QueryInt64("spec_id")
  651. TrainUrl := ctx.Query("train_url")
  652. CkptName := ctx.Query("ckpt_name")
  653. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  654. log.Info("ckpt url:" + ckptUrl)
  655. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  656. JobType: models.JobTypeBenchmark,
  657. ComputeResource: models.GPU,
  658. Cluster: models.OpenICluster,
  659. AiCenterCode: models.AICenterOfCloudBrainOne})
  660. if err != nil || spec == nil {
  661. modelSafetyNewDataPrepare(ctx)
  662. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGpu, nil)
  663. return
  664. }
  665. repo := ctx.Repo.Repository
  666. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  667. os.RemoveAll(codePath)
  668. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  669. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  670. modelSafetyNewDataPrepare(ctx)
  671. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  672. return
  673. }
  674. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  675. if err != nil {
  676. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  677. modelSafetyNewDataPrepare(ctx)
  678. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  679. return
  680. }
  681. uuid := srcDataset + ";" + combatDataset
  682. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  683. log.Info("uuid=" + uuid)
  684. if err != nil {
  685. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  686. modelSafetyNewDataPrepare(ctx)
  687. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil)
  688. return
  689. }
  690. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  691. if err != nil {
  692. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  693. modelSafetyNewDataPrepare(ctx)
  694. //ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil) TODO
  695. return
  696. }
  697. log.Info("Command=" + command)
  698. req := cloudbrain.GenerateCloudBrainTaskReq{
  699. Ctx: ctx,
  700. DisplayJobName: displayJobName,
  701. JobName: jobName,
  702. Image: image,
  703. Command: command,
  704. Uuids: uuid,
  705. DatasetNames: datasetNames,
  706. DatasetInfos: datasetInfos,
  707. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  708. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  709. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  710. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  711. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  712. JobType: string(models.JobTypeModelSafety),
  713. Description: description,
  714. BranchName: cloudbrain.DefaultBranchName,
  715. BootFile: BootFile,
  716. Params: Params,
  717. CommitID: "",
  718. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  719. Spec: spec,
  720. LabelName: evaluationIndex,
  721. }
  722. err = cloudbrain.GenerateTask(req)
  723. if err != nil {
  724. modelSafetyNewDataPrepare(ctx)
  725. ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, nil)
  726. return
  727. }
  728. //ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/modelsafety_test")
  729. }
  730. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  731. var command string
  732. bootFile := strings.TrimSpace(BootFile)
  733. if !strings.HasSuffix(bootFile, ".py") {
  734. log.Error("bootFile(%s) format error", bootFile)
  735. return command, errors.New("bootFile format error")
  736. }
  737. var parameters models.Parameters
  738. var param string
  739. if len(params) != 0 {
  740. err := json.Unmarshal([]byte(params), &parameters)
  741. if err != nil {
  742. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  743. return command, err
  744. }
  745. for _, parameter := range parameters.Parameter {
  746. param += " --" + parameter.Label + "=" + parameter.Value
  747. }
  748. }
  749. param += " --modelname" + "=" + CkptName
  750. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  751. return command, nil
  752. }
  753. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  754. ctx.Data["PageIsCloudBrain"] = true
  755. t := time.Now()
  756. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  757. ctx.Data["display_job_name"] = displayJobName
  758. ctx.Data["command"] = cloudbrain.GetCloudbrainDebugCommand()
  759. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  760. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  761. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  762. ctx.Data["benchmark_path"] = cloudbrain.BenchMarkMountPath
  763. ctx.Data["is_benchmark_enabled"] = setting.IsBenchmarkEnabled
  764. if categories == nil {
  765. json.Unmarshal([]byte(setting.BenchmarkCategory), &categories)
  766. }
  767. ctx.Data["benchmark_categories"] = categories.Category
  768. ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType
  769. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  770. if queuesDetail != nil {
  771. ctx.Data["QueuesDetail"] = queuesDetail
  772. }
  773. prepareCloudbrainOneSpecs(ctx)
  774. ctx.Data["params"] = ""
  775. ctx.Data["branchName"] = ctx.Repo.BranchName
  776. ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath
  777. ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled
  778. ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath
  779. ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled
  780. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  781. ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode")
  782. return nil
  783. }
  784. func getJsonContent(url string) (string, error) {
  785. resp, err := http.Get(url)
  786. if err != nil || resp.StatusCode != 200 {
  787. log.Info("Get organizations url error=" + err.Error())
  788. return "", err
  789. }
  790. bytes, err := ioutil.ReadAll(resp.Body)
  791. resp.Body.Close()
  792. if err != nil {
  793. log.Info("Get organizations url error=" + err.Error())
  794. return "", err
  795. }
  796. str := string(bytes)
  797. //log.Info("json str =" + str)
  798. return str, nil
  799. }