You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

aisafety.go 42 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/aisafety"
  16. "code.gitea.io/gitea/modules/cloudbrain"
  17. "code.gitea.io/gitea/modules/context"
  18. "code.gitea.io/gitea/modules/git"
  19. "code.gitea.io/gitea/modules/grampus"
  20. "code.gitea.io/gitea/modules/log"
  21. "code.gitea.io/gitea/modules/modelarts"
  22. "code.gitea.io/gitea/modules/setting"
  23. "code.gitea.io/gitea/modules/storage"
  24. "code.gitea.io/gitea/modules/timeutil"
  25. "code.gitea.io/gitea/modules/util"
  26. "code.gitea.io/gitea/services/cloudbrain/resource"
  27. "code.gitea.io/gitea/services/reward/point/account"
  28. uuid "github.com/satori/go.uuid"
  29. )
  30. const (
  31. tplModelSafetyTestCreateGrampusGpu = "repo/modelsafety/newgrampusgpu"
  32. tplModelSafetyTestCreateGrampusNpu = "repo/modelsafety/newgrampusnpu"
  33. tplModelSafetyTestCreateGpu = "repo/modelsafety/newgpu"
  34. tplModelSafetyTestCreateNpu = "repo/modelsafety/newnpu"
  35. tplModelSafetyTestShow = "repo/modelsafety/show"
  36. )
  37. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  38. log.Info("start to create CloudBrainAiSafetyCreate")
  39. uuid := uuid.NewV4()
  40. id := uuid.String()
  41. seriaNoParas := ctx.Query("serialNo")
  42. fileName := ctx.Query("fileName")
  43. //if jobType == string(models.JobTypeBenchmark) {
  44. req := aisafety.TaskReq{
  45. UnionId: id,
  46. EvalName: "test1",
  47. EvalContent: "test1",
  48. TLPath: "test1",
  49. Indicators: []string{"ACC", "ASS"},
  50. CDName: "CIFAR10_1000_FGSM",
  51. BDName: "CIFAR10_1000基础数据集",
  52. }
  53. aisafety.GetAlgorithmList()
  54. if seriaNoParas != "" {
  55. aisafety.GetTaskStatus(seriaNoParas)
  56. } else {
  57. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  58. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  59. if err == nil {
  60. log.Info("serialNo=" + serialNo)
  61. time.Sleep(time.Duration(2) * time.Second)
  62. aisafety.GetTaskStatus(serialNo)
  63. } else {
  64. log.Info("CreateSafetyTask error," + err.Error())
  65. }
  66. }
  67. }
  68. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  69. if job == nil {
  70. log.Error("GetCloudbrainByJobID failed")
  71. return
  72. }
  73. syncAiSafetyTaskStatus(job)
  74. }
  75. func GetAiSafetyTaskTmpl(ctx *context.Context) {
  76. ctx.Data["id"] = ctx.Params(":jobid")
  77. ctx.HTML(200, tplModelSafetyTestShow)
  78. }
  79. func GetAiSafetyTask(ctx *context.Context) {
  80. var ID = ctx.Params(":jobid")
  81. job, err := models.GetCloudbrainByIDWithDeleted(ID)
  82. if err != nil {
  83. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  84. return
  85. }
  86. syncAiSafetyTaskStatus(job)
  87. job, err = models.GetCloudbrainByIDWithDeleted(ID)
  88. job.BenchmarkType = "安全评测"
  89. job.BenchmarkTypeName = "Image Classification"
  90. ctx.JSON(200, job)
  91. }
  92. func StopAiSafetyTask(ctx *context.Context) {
  93. log.Info("start to stop the task.")
  94. var ID = ctx.Params(":jobid")
  95. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  96. result := make(map[string]interface{})
  97. result["code"] = -1
  98. if err != nil {
  99. log.Info("query task error.err=" + err.Error())
  100. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  101. result["msg"] = "No such task."
  102. ctx.JSON(200, result)
  103. return
  104. }
  105. if isTaskNotFinished(task.Status) {
  106. if task.Type == models.TypeCloudBrainTwo {
  107. log.Info("start to stop model arts task.")
  108. _, err := modelarts.StopTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  109. if err != nil {
  110. log.Info("stop failed.err=" + err.Error())
  111. }
  112. task.Status = string(models.JobStopped)
  113. if task.EndTime == 0 {
  114. task.EndTime = timeutil.TimeStampNow()
  115. }
  116. task.ComputeAndSetDuration()
  117. err = models.UpdateJob(task)
  118. if err != nil {
  119. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  120. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  121. ctx.JSON(200, result)
  122. return
  123. }
  124. //queryTaskStatusFromCloudbrainTwo(job)
  125. } else if task.Type == models.TypeCloudBrainOne {
  126. if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) || task.Status == string(models.JobSucceeded) {
  127. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  128. result["msg"] = "cloudbrain.Already_stopped"
  129. ctx.JSON(200, result)
  130. return
  131. }
  132. err := cloudbrain.StopJob(task.JobID)
  133. if err != nil {
  134. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  135. result["msg"] = "cloudbrain.Stopped_failed"
  136. ctx.JSON(200, result)
  137. return
  138. }
  139. task.Status = string(models.JobStopped)
  140. if task.EndTime == 0 {
  141. task.EndTime = timeutil.TimeStampNow()
  142. }
  143. task.ComputeAndSetDuration()
  144. err = models.UpdateJob(task)
  145. if err != nil {
  146. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  147. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  148. ctx.JSON(200, result)
  149. return
  150. }
  151. }
  152. } else {
  153. if task.Status == string(models.ModelSafetyTesting) {
  154. //修改为Failed
  155. task.Status = string(models.JobStopped)
  156. if task.EndTime == 0 {
  157. task.EndTime = timeutil.TimeStampNow()
  158. }
  159. task.ComputeAndSetDuration()
  160. err = models.UpdateJob(task)
  161. if err != nil {
  162. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  163. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  164. ctx.JSON(200, result)
  165. return
  166. }
  167. } else {
  168. log.Info("The job is finished. status=" + task.Status)
  169. }
  170. }
  171. }
  172. func DelAiSafetyTask(ctx *context.Context) {
  173. var ID = ctx.Params(":jobid")
  174. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  175. result := make(map[string]interface{})
  176. result["code"] = 1
  177. if err != nil {
  178. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  179. result["msg"] = "No such task."
  180. ctx.ServerError("No such task.", err)
  181. return
  182. }
  183. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  184. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  185. result["msg"] = "the job(" + task.JobName + ") has not been stopped"
  186. ctx.ServerError("the job("+task.JobName+") has not been stopped", nil)
  187. return
  188. }
  189. if task.Type == models.TypeCloudBrainOne {
  190. DeleteCloudbrainJobStorage(task.JobName, models.TypeCloudBrainOne)
  191. }
  192. err = models.DeleteJob(task)
  193. if err != nil {
  194. result["msg"] = err.Error()
  195. ctx.ServerError(err.Error(), err)
  196. return
  197. }
  198. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  199. }
  200. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  201. log.Info("start to query safety task status.")
  202. if isTaskNotFinished(job.Status) {
  203. if job.Type == models.TypeCloudBrainTwo {
  204. queryTaskStatusFromCloudbrainTwo(job)
  205. } else if job.Type == models.TypeCloudBrainOne {
  206. queryTaskStatusFromCloudbrain(job)
  207. } else if job.Type == models.TypeC2Net {
  208. queryTaskStatusFromGrampus(job)
  209. }
  210. } else {
  211. if job.Status == string(models.ModelSafetyTesting) {
  212. queryTaskStatusFromModelSafetyTestServer(job)
  213. } else {
  214. log.Info("The job is finished. status=" + job.Status)
  215. }
  216. }
  217. }
  218. func TimerHandleModelSafetyTestTask() {
  219. log.Info("start to TimerHandleModelSafetyTestTask")
  220. tasks, err := models.GetModelSafetyTestTask()
  221. if err == nil {
  222. if tasks != nil && len(tasks) > 0 {
  223. for _, job := range tasks {
  224. syncAiSafetyTaskStatus(job)
  225. }
  226. } else {
  227. log.Info("query running model safety test task 0.")
  228. }
  229. } else {
  230. log.Info("query running model safety test task err." + err.Error())
  231. }
  232. }
  233. func queryTaskStatusFromGrampus(task *models.Cloudbrain) {
  234. log.Info("The task not finished,name=" + task.DisplayJobName)
  235. if task.DeletedAt.IsZero() { //normal record
  236. result, err := grampus.GetJob(task.JobID)
  237. resultJson, _ := json.Marshal(result)
  238. log.Info("resultJson=" + string(resultJson))
  239. if err != nil {
  240. log.Error("GetJob failed:" + err.Error())
  241. return
  242. }
  243. if result != nil {
  244. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  245. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  246. }
  247. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  248. if task.Status != models.GrampusStatusSucceeded {
  249. if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning {
  250. task.Duration = result.JobInfo.RunSec
  251. if task.Duration < 0 {
  252. task.Duration = 0
  253. }
  254. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  255. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  256. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  257. }
  258. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  259. task.EndTime = task.StartTime.Add(task.Duration)
  260. }
  261. task.CorrectCreateUnix()
  262. err = models.UpdateJob(task)
  263. if err != nil {
  264. log.Error("UpdateJob failed:" + err.Error())
  265. }
  266. }
  267. } else {
  268. task.Status = string(models.ModelSafetyTesting)
  269. err = models.UpdateJob(task)
  270. if err != nil {
  271. log.Error("UpdateJob failed:", err)
  272. }
  273. //send msg to beihang
  274. sendGPUInferenceResultToTest(task)
  275. }
  276. }
  277. }
  278. }
  279. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  280. log.Info("The task not finished,name=" + job.DisplayJobName)
  281. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  282. if err != nil {
  283. log.Info("query train job error." + err.Error())
  284. return
  285. }
  286. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  287. job.Duration = result.Duration / 1000
  288. job.TrainJobDuration = result.TrainJobDuration
  289. if job.StartTime == 0 && result.StartTime > 0 {
  290. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  291. }
  292. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  293. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  294. job.EndTime = job.StartTime.Add(job.Duration)
  295. }
  296. job.CorrectCreateUnix()
  297. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  298. log.Info("CloudbrainTwo task status=" + job.Status)
  299. err = models.UpdateJob(job)
  300. if err != nil {
  301. log.Error("UpdateJob failed:", err)
  302. }
  303. } else {
  304. log.Info("start to deal ModelSafetyTesting, task status=" + job.Status)
  305. job.Status = string(models.ModelSafetyTesting)
  306. err = models.UpdateJob(job)
  307. if err != nil {
  308. log.Error("UpdateJob failed:", err)
  309. }
  310. //send msg to beihang
  311. sendNPUInferenceResultToTest(job)
  312. }
  313. }
  314. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  315. log.Info("start to sendNPUInferenceResultToTest")
  316. datasetname := job.DatasetName
  317. datasetnames := strings.Split(datasetname, ";")
  318. indicator := job.LabelName
  319. EvalContent := "test1"
  320. if job.Description != "" {
  321. EvalContent = job.Description
  322. }
  323. req := aisafety.TaskReq{
  324. UnionId: job.JobID,
  325. EvalName: job.DisplayJobName,
  326. EvalContent: EvalContent,
  327. TLPath: "test",
  328. Indicators: strings.Split(indicator, ";"),
  329. CDName: datasetnames[1][0 : len(datasetnames[1])-4],
  330. BDName: datasetnames[0][0 : len(datasetnames[1])-4],
  331. }
  332. jsonContent := ""
  333. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  334. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  335. resultPath = resultPath[1:]
  336. log.Info("bucket=" + setting.Bucket + " resultPath=" + resultPath)
  337. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  338. if err != nil {
  339. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  340. } else {
  341. defer body.Close()
  342. var data []byte
  343. p := make([]byte, 4096)
  344. var readErr error
  345. var readCount int
  346. for {
  347. readCount, readErr = body.Read(p)
  348. if readCount > 0 {
  349. data = append(data, p[:readCount]...)
  350. }
  351. if readErr != nil || readCount == 0 {
  352. break
  353. }
  354. }
  355. jsonContent = string(data)
  356. }
  357. if jsonContent != "" {
  358. log.Info("start to send beihang ...")
  359. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  360. if err == nil {
  361. //update serial no to db
  362. job.PreVersionName = serialNo
  363. err = models.UpdateJob(job)
  364. if err != nil {
  365. log.Error("UpdateJob failed:", err)
  366. }
  367. }
  368. } else {
  369. log.Info("The json is null. so set it failed.")
  370. //update task failed.
  371. job.Status = string(models.ModelArtsTrainJobFailed)
  372. err := models.UpdateJob(job)
  373. if err != nil {
  374. log.Error("UpdateJob failed:", err)
  375. }
  376. }
  377. }
  378. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  379. log.Info("The task not finished,name=" + job.DisplayJobName)
  380. jobResult, err := cloudbrain.GetJob(job.JobID)
  381. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  382. if err != nil {
  383. log.Error("ConvertToJobResultPayload failed:", err)
  384. return
  385. }
  386. job.Status = result.JobStatus.State
  387. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  388. taskRoles := result.TaskRoles
  389. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  390. job.Status = taskRes.TaskStatuses[0].State
  391. }
  392. if result.JobStatus.State != string(models.JobSucceeded) {
  393. err = models.UpdateJob(job)
  394. if err != nil {
  395. log.Error("UpdateJob failed:", err)
  396. }
  397. } else {
  398. //
  399. job.Status = string(models.ModelSafetyTesting)
  400. err = models.UpdateJob(job)
  401. if err != nil {
  402. log.Error("UpdateJob failed:", err)
  403. }
  404. //send msg to beihang
  405. sendGPUInferenceResultToTest(job)
  406. }
  407. }
  408. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  409. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  410. if err == nil {
  411. if result.Code == "0" {
  412. if result.Data.Status == 1 {
  413. log.Info("The task is running....")
  414. } else {
  415. if result.Data.Code == 0 {
  416. job.ResultJson = result.Data.StandardJson
  417. err = models.UpdateJob(job)
  418. if err != nil {
  419. log.Error("UpdateJob failed:", err)
  420. }
  421. }
  422. }
  423. } else {
  424. log.Info("The task is failed.")
  425. job.Status = string(models.JobFailed)
  426. err = models.UpdateJob(job)
  427. if err != nil {
  428. log.Error("UpdateJob failed:", err)
  429. }
  430. }
  431. } else {
  432. log.Info("The task not found.....")
  433. }
  434. }
  435. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  436. datasetname := job.DatasetName
  437. datasetnames := strings.Split(datasetname, ";")
  438. indicator := job.LabelName
  439. EvalContent := "test1"
  440. if job.Description != "" {
  441. EvalContent = job.Description
  442. }
  443. req := aisafety.TaskReq{
  444. UnionId: job.JobID,
  445. EvalName: job.DisplayJobName,
  446. EvalContent: EvalContent,
  447. TLPath: "test1",
  448. Indicators: strings.Split(indicator, ";"),
  449. CDName: datasetnames[1],
  450. BDName: datasetnames[0],
  451. }
  452. resultDir := "/model"
  453. prefix := "/" + setting.CBCodePathPrefix + job.JobName + resultDir
  454. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  455. if err != nil {
  456. log.Error("query cloudbrain one model failed: %v", err)
  457. return
  458. }
  459. jsonContent := ""
  460. for _, file := range files {
  461. if strings.HasSuffix(file.FileName, "result.json") {
  462. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  463. log.Info("path=" + path)
  464. reader, err := os.Open(path)
  465. defer reader.Close()
  466. if err == nil {
  467. r := bufio.NewReader(reader)
  468. for {
  469. line, error := r.ReadString('\n')
  470. if error == io.EOF {
  471. log.Info("read file completed.")
  472. break
  473. }
  474. if error != nil {
  475. log.Info("read file error." + error.Error())
  476. break
  477. }
  478. jsonContent += line
  479. }
  480. }
  481. break
  482. }
  483. }
  484. if jsonContent != "" {
  485. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  486. if err == nil {
  487. //update serial no to db
  488. job.PreVersionName = serialNo
  489. err = models.UpdateJob(job)
  490. if err != nil {
  491. log.Error("UpdateJob failed:", err)
  492. }
  493. }
  494. } else {
  495. log.Info("The json is null. so set it failed.")
  496. //update task failed.
  497. job.Status = string(models.JobFailed)
  498. err = models.UpdateJob(job)
  499. if err != nil {
  500. log.Error("UpdateJob failed:", err)
  501. }
  502. }
  503. }
  504. func isTaskNotFinished(status string) bool {
  505. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  506. return true
  507. }
  508. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  509. return true
  510. }
  511. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  512. return true
  513. }
  514. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  515. return true
  516. }
  517. return false
  518. }
  519. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  520. t := time.Now()
  521. ctx.Data["PageIsCloudBrain"] = true
  522. ctx.Data["IsCreate"] = true
  523. ctx.Data["type"] = models.TypeCloudBrainOne
  524. ctx.Data["compute_resource"] = models.GPUResource
  525. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  526. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  527. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  528. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  529. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  530. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  531. ctx.Data["display_job_name"] = displayJobName
  532. prepareCloudbrainOneSpecs(ctx)
  533. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  534. if queuesDetail != nil {
  535. ctx.Data["QueuesDetail"] = queuesDetail
  536. }
  537. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  538. }
  539. func AiSafetyCreateForGetGrampusGPU(ctx *context.Context) {
  540. ctx.Data["PageIsCloudBrain"] = true
  541. ctx.Data["IsCreate"] = true
  542. ctx.Data["type"] = models.TypeC2Net
  543. ctx.Data["compute_resource"] = models.GPUResource
  544. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  545. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  546. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  547. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  548. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  549. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  550. if err != nil {
  551. ctx.ServerError("get new train-job info failed", err)
  552. return
  553. }
  554. ctx.HTML(200, tplModelSafetyTestCreateGrampusGpu)
  555. }
  556. func AiSafetyCreateForGetGrampusNPU(ctx *context.Context) {
  557. ctx.Data["PageIsCloudBrain"] = true
  558. ctx.Data["IsCreate"] = true
  559. ctx.Data["type"] = models.TypeC2Net
  560. ctx.Data["compute_resource"] = models.NPUResource
  561. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  562. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  563. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  564. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  565. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  566. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  567. if err != nil {
  568. ctx.ServerError("get new train-job info failed", err)
  569. return
  570. }
  571. ctx.HTML(200, tplModelSafetyTestCreateGrampusNpu)
  572. }
  573. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  574. t := time.Now()
  575. ctx.Data["PageIsCloudBrain"] = true
  576. ctx.Data["IsCreate"] = true
  577. ctx.Data["type"] = models.TypeCloudBrainTwo
  578. ctx.Data["compute_resource"] = models.NPUResource
  579. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  580. ctx.Data["display_job_name"] = displayJobName
  581. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  582. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  583. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  584. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  585. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  586. var resourcePools modelarts.ResourcePool
  587. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  588. ctx.ServerError("json.Unmarshal failed:", err)
  589. }
  590. ctx.Data["resource_pools"] = resourcePools.Info
  591. var engines modelarts.Engine
  592. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  593. ctx.ServerError("json.Unmarshal failed:", err)
  594. }
  595. ctx.Data["engines"] = engines.Info
  596. var versionInfos modelarts.VersionInfo
  597. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  598. ctx.ServerError("json.Unmarshal failed:", err)
  599. }
  600. ctx.Data["engine_versions"] = versionInfos.Version
  601. prepareCloudbrainTwoInferenceSpecs(ctx)
  602. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  603. ctx.Data["WaitCount"] = waitCount
  604. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  605. }
  606. func AiSafetyCreateForPost(ctx *context.Context) {
  607. ctx.Data["PageIsCloudBrain"] = true
  608. displayJobName := ctx.Query("display_job_name")
  609. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  610. taskType := ctx.QueryInt("type")
  611. description := ctx.Query("description")
  612. ctx.Data["type"] = taskType
  613. ctx.Data["displayJobName"] = displayJobName
  614. ctx.Data["description"] = description
  615. repo := ctx.Repo.Repository
  616. tpname := tplCloudBrainModelSafetyNewNpu
  617. if taskType == models.TypeCloudBrainOne {
  618. tpname = tplCloudBrainModelSafetyNewGpu
  619. }
  620. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  621. if err == nil {
  622. if len(tasks) != 0 {
  623. log.Error("the job name did already exist", ctx.Data["MsgID"])
  624. modelSafetyNewDataPrepare(ctx)
  625. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  626. return
  627. }
  628. } else {
  629. if !models.IsErrJobNotExist(err) {
  630. log.Error("system error, %v", err, ctx.Data["MsgID"])
  631. modelSafetyNewDataPrepare(ctx)
  632. ctx.RenderWithErr("system error", tpname, nil)
  633. return
  634. }
  635. }
  636. if !jobNamePattern.MatchString(jobName) {
  637. modelSafetyNewDataPrepare(ctx)
  638. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  639. return
  640. }
  641. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  642. if err != nil {
  643. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  644. modelSafetyNewDataPrepare(ctx)
  645. ctx.RenderWithErr("system error", tpname, nil)
  646. return
  647. } else {
  648. if count >= 1 {
  649. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  650. modelSafetyNewDataPrepare(ctx)
  651. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  652. return
  653. }
  654. }
  655. BootFile := ctx.Query("boot_file")
  656. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  657. if err != nil || !bootFileExist {
  658. log.Error("Get bootfile error:", err)
  659. modelSafetyNewDataPrepare(ctx)
  660. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  661. return
  662. }
  663. if taskType == models.TypeCloudBrainTwo {
  664. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  665. createForNPU(ctx, jobName)
  666. } else if taskType == models.TypeCloudBrainOne {
  667. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  668. createForGPU(ctx, jobName)
  669. } else if taskType == models.TypeC2Net {
  670. ComputeResource := ctx.Query("compute_resource")
  671. if ComputeResource == models.NPUResource {
  672. createForGrampusNPU(ctx, jobName)
  673. } else if ComputeResource == models.GPUResource {
  674. createForGrampusGPU(ctx, jobName)
  675. }
  676. }
  677. log.Info("to redirect...")
  678. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  679. }
  680. func createForGrampusGPU(ctx *context.Context, jobName string) {
  681. BootFile := ctx.Query("boot_file")
  682. displayJobName := ctx.Query("display_job_name")
  683. description := ctx.Query("description")
  684. image := strings.TrimSpace(ctx.Query("image"))
  685. srcDataset := ctx.Query("src_dataset") //uuid
  686. combatDataset := ctx.Query("combat_dataset") //uuid
  687. evaluationIndex := ctx.Query("evaluation_index")
  688. Params := ctx.Query("run_para_list")
  689. specId := ctx.QueryInt64("spec_id")
  690. TrainUrl := ctx.Query("train_url")
  691. CkptName := ctx.Query("ckpt_name")
  692. ModelName := ctx.Query("model_name")
  693. ModelVersion := ctx.Query("model_version")
  694. repo := ctx.Repo.Repository
  695. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  696. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  697. //check specification
  698. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  699. JobType: models.JobTypeTrain,
  700. ComputeResource: models.GPU,
  701. Cluster: models.C2NetCluster,
  702. })
  703. if err != nil || spec == nil {
  704. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  705. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  706. return
  707. }
  708. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  709. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  710. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  711. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  712. return
  713. }
  714. //check dataset
  715. uuid := srcDataset + ";" + combatDataset
  716. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  717. if err != nil {
  718. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  719. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  720. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  721. return
  722. }
  723. //prepare code and out path
  724. _, err = ioutil.ReadDir(codeLocalPath)
  725. if err == nil {
  726. os.RemoveAll(codeLocalPath)
  727. }
  728. if err := downloadZipCode(ctx, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  729. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  730. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  731. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  732. return
  733. }
  734. //todo: upload code (send to file_server todo this work?)
  735. //upload code
  736. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  737. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  738. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  739. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  740. return
  741. }
  742. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  743. if err := mkModelPath(modelPath); err != nil {
  744. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  745. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  746. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  747. return
  748. }
  749. //init model readme
  750. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  751. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  752. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  753. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  754. return
  755. }
  756. var datasetRemotePath, allFileName string
  757. for _, datasetInfo := range datasetInfos {
  758. if datasetRemotePath == "" {
  759. datasetRemotePath = datasetInfo.DataLocalPath
  760. allFileName = datasetInfo.FullName
  761. } else {
  762. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  763. allFileName = allFileName + ";" + datasetInfo.FullName
  764. }
  765. }
  766. //prepare command
  767. preTrainModelPath := getPreTrainModelPath(TrainUrl, CkptName)
  768. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, BootFile, Params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, CkptName)
  769. if err != nil {
  770. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  771. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  772. ctx.RenderWithErr("Create task failed, internal error", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  773. return
  774. }
  775. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  776. req := &grampus.GenerateTrainJobReq{
  777. JobName: jobName,
  778. DisplayJobName: displayJobName,
  779. ComputeResource: models.GPUResource,
  780. ProcessType: grampus.ProcessorTypeGPU,
  781. Command: command,
  782. ImageUrl: image,
  783. Description: description,
  784. BootFile: BootFile,
  785. Uuid: uuid,
  786. CommitID: commitID,
  787. BranchName: cloudbrain.DefaultBranchName,
  788. Params: Params,
  789. EngineName: image,
  790. DatasetNames: datasetNames,
  791. DatasetInfos: datasetInfos,
  792. IsLatestVersion: modelarts.IsLatestVersion,
  793. VersionCount: modelarts.VersionCountOne,
  794. WorkServerNumber: 1,
  795. Spec: spec,
  796. ModelName: ModelName,
  797. LabelName: evaluationIndex,
  798. CkptName: CkptName,
  799. ModelVersion: ModelVersion,
  800. PreTrainModelUrl: TrainUrl,
  801. }
  802. err = grampus.GenerateTrainJob(ctx, req)
  803. if err != nil {
  804. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  805. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  806. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  807. return
  808. }
  809. }
  810. func createForGrampusNPU(ctx *context.Context, jobName string) {
  811. }
  812. func createForNPU(ctx *context.Context, jobName string) {
  813. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  814. BootFile := ctx.Query("boot_file")
  815. displayJobName := ctx.Query("display_job_name")
  816. description := ctx.Query("description")
  817. srcDataset := ctx.Query("src_dataset") //uuid
  818. combatDataset := ctx.Query("combat_dataset") //uuid
  819. evaluationIndex := ctx.Query("evaluation_index")
  820. Params := ctx.Query("run_para_list")
  821. specId := ctx.QueryInt64("spec_id")
  822. engineID := ctx.QueryInt("engine_id")
  823. log.Info("engine_id=" + fmt.Sprint(engineID))
  824. poolID := ctx.Query("pool_id")
  825. repo := ctx.Repo.Repository
  826. trainUrl := ctx.Query("train_url")
  827. modelName := ctx.Query("model_name")
  828. modelVersion := ctx.Query("model_version")
  829. ckptName := ctx.Query("ckpt_name")
  830. ckptUrl := "/" + trainUrl + ckptName
  831. log.Info("ckpt url:" + ckptUrl)
  832. FlavorName := ctx.Query("flaver_names")
  833. EngineName := ctx.Query("engine_names")
  834. isLatestVersion := modelarts.IsLatestVersion
  835. VersionCount := modelarts.VersionCountOne
  836. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  837. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  838. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  839. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  840. log.Info("ckpt url:" + ckptUrl)
  841. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  842. JobType: models.JobTypeInference,
  843. ComputeResource: models.NPU,
  844. Cluster: models.OpenICluster,
  845. AiCenterCode: models.AICenterOfCloudBrainTwo})
  846. if err != nil || spec == nil {
  847. modelSafetyNewDataPrepare(ctx)
  848. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  849. return
  850. }
  851. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  852. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  853. modelSafetyNewDataPrepare(ctx)
  854. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewNpu, nil)
  855. return
  856. }
  857. //todo: del the codeLocalPath
  858. _, err = ioutil.ReadDir(codeLocalPath)
  859. if err == nil {
  860. os.RemoveAll(codeLocalPath)
  861. }
  862. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  863. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  864. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  865. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  866. modelSafetyNewDataPrepare(ctx)
  867. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  868. return
  869. }
  870. //todo: upload code (send to file_server todo this work?)
  871. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  872. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  873. modelSafetyNewDataPrepare(ctx)
  874. ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNewNpu, nil)
  875. return
  876. }
  877. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  878. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  879. modelSafetyNewDataPrepare(ctx)
  880. ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNewNpu, nil)
  881. return
  882. }
  883. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  884. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  885. modelSafetyNewDataPrepare(ctx)
  886. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  887. return
  888. }
  889. var parameters models.Parameters
  890. param := make([]models.Parameter, 0)
  891. param = append(param, models.Parameter{
  892. Label: modelarts.ResultUrl,
  893. Value: "s3:/" + resultObsPath,
  894. }, models.Parameter{
  895. Label: modelarts.CkptUrl,
  896. Value: "s3:/" + ckptUrl,
  897. })
  898. uuid := srcDataset + ";" + combatDataset
  899. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  900. if err != nil {
  901. modelSafetyNewDataPrepare(ctx)
  902. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  903. return
  904. }
  905. dataPath := dataUrl
  906. jsondatas, err := json.Marshal(datasUrlList)
  907. if err != nil {
  908. log.Error("Failed to Marshal: %v", err)
  909. modelSafetyNewDataPrepare(ctx)
  910. ctx.RenderWithErr("json error:"+err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  911. return
  912. }
  913. if isMultiDataset {
  914. param = append(param, models.Parameter{
  915. Label: modelarts.MultiDataUrl,
  916. Value: string(jsondatas),
  917. })
  918. }
  919. existDeviceTarget := false
  920. if len(Params) != 0 {
  921. err := json.Unmarshal([]byte(Params), &parameters)
  922. if err != nil {
  923. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  924. modelSafetyNewDataPrepare(ctx)
  925. ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNewNpu, nil)
  926. return
  927. }
  928. for _, parameter := range parameters.Parameter {
  929. if parameter.Label == modelarts.DeviceTarget {
  930. existDeviceTarget = true
  931. }
  932. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  933. param = append(param, models.Parameter{
  934. Label: parameter.Label,
  935. Value: parameter.Value,
  936. })
  937. }
  938. }
  939. }
  940. if !existDeviceTarget {
  941. param = append(param, models.Parameter{
  942. Label: modelarts.DeviceTarget,
  943. Value: modelarts.Ascend,
  944. })
  945. }
  946. req := &modelarts.GenerateInferenceJobReq{
  947. JobName: jobName,
  948. DisplayJobName: displayJobName,
  949. DataUrl: dataPath,
  950. Description: description,
  951. CodeObsPath: codeObsPath,
  952. BootFileUrl: codeObsPath + BootFile,
  953. BootFile: BootFile,
  954. TrainUrl: trainUrl,
  955. WorkServerNumber: 1,
  956. EngineID: int64(engineID),
  957. LogUrl: logObsPath,
  958. PoolID: poolID,
  959. Uuid: uuid,
  960. Parameters: param, //modelarts train parameters
  961. CommitID: commitID,
  962. BranchName: cloudbrain.DefaultBranchName,
  963. Params: Params,
  964. FlavorName: FlavorName,
  965. EngineName: EngineName,
  966. LabelName: evaluationIndex,
  967. IsLatestVersion: isLatestVersion,
  968. VersionCount: VersionCount,
  969. TotalVersionCount: modelarts.TotalVersionCount,
  970. ModelName: modelName,
  971. ModelVersion: modelVersion,
  972. CkptName: ckptName,
  973. ResultUrl: resultObsPath,
  974. Spec: spec,
  975. DatasetName: datasetNames,
  976. JobType: string(models.JobTypeModelSafety),
  977. }
  978. err = modelarts.GenerateInferenceJob(ctx, req)
  979. if err != nil {
  980. log.Error("GenerateTrainJob failed:%v", err.Error())
  981. modelSafetyNewDataPrepare(ctx)
  982. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  983. return
  984. }
  985. }
  986. func createForGPU(ctx *context.Context, jobName string) {
  987. BootFile := ctx.Query("boot_file")
  988. displayJobName := ctx.Query("display_job_name")
  989. description := ctx.Query("description")
  990. image := strings.TrimSpace(ctx.Query("image"))
  991. srcDataset := ctx.Query("src_dataset") //uuid
  992. combatDataset := ctx.Query("combat_dataset") //uuid
  993. evaluationIndex := ctx.Query("evaluation_index")
  994. Params := ctx.Query("run_para_list")
  995. specId := ctx.QueryInt64("spec_id")
  996. TrainUrl := ctx.Query("train_url")
  997. CkptName := ctx.Query("ckpt_name")
  998. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  999. log.Info("ckpt url:" + ckptUrl)
  1000. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  1001. JobType: models.JobTypeBenchmark,
  1002. ComputeResource: models.GPU,
  1003. Cluster: models.OpenICluster,
  1004. AiCenterCode: models.AICenterOfCloudBrainOne})
  1005. if err != nil || spec == nil {
  1006. modelSafetyNewDataPrepare(ctx)
  1007. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGpu, nil)
  1008. return
  1009. }
  1010. repo := ctx.Repo.Repository
  1011. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  1012. os.RemoveAll(codePath)
  1013. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  1014. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  1015. modelSafetyNewDataPrepare(ctx)
  1016. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  1017. return
  1018. }
  1019. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  1020. if err != nil {
  1021. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  1022. modelSafetyNewDataPrepare(ctx)
  1023. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  1024. return
  1025. }
  1026. uuid := srcDataset + ";" + combatDataset
  1027. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  1028. log.Info("uuid=" + uuid)
  1029. if err != nil {
  1030. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  1031. modelSafetyNewDataPrepare(ctx)
  1032. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil)
  1033. return
  1034. }
  1035. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  1036. if err != nil {
  1037. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  1038. modelSafetyNewDataPrepare(ctx)
  1039. //ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil) TODO
  1040. return
  1041. }
  1042. log.Info("Command=" + command)
  1043. req := cloudbrain.GenerateCloudBrainTaskReq{
  1044. Ctx: ctx,
  1045. DisplayJobName: displayJobName,
  1046. JobName: jobName,
  1047. Image: image,
  1048. Command: command,
  1049. Uuids: uuid,
  1050. DatasetNames: datasetNames,
  1051. DatasetInfos: datasetInfos,
  1052. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  1053. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  1054. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  1055. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  1056. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  1057. JobType: string(models.JobTypeModelSafety),
  1058. Description: description,
  1059. BranchName: cloudbrain.DefaultBranchName,
  1060. BootFile: BootFile,
  1061. Params: Params,
  1062. CommitID: "",
  1063. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  1064. Spec: spec,
  1065. LabelName: evaluationIndex,
  1066. }
  1067. err = cloudbrain.GenerateTask(req)
  1068. if err != nil {
  1069. modelSafetyNewDataPrepare(ctx)
  1070. ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, nil)
  1071. return
  1072. }
  1073. //ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/modelsafety_test")
  1074. }
  1075. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  1076. var command string
  1077. bootFile := strings.TrimSpace(BootFile)
  1078. if !strings.HasSuffix(bootFile, ".py") {
  1079. log.Error("bootFile(%s) format error", bootFile)
  1080. return command, errors.New("bootFile format error")
  1081. }
  1082. var parameters models.Parameters
  1083. var param string
  1084. if len(params) != 0 {
  1085. err := json.Unmarshal([]byte(params), &parameters)
  1086. if err != nil {
  1087. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1088. return command, err
  1089. }
  1090. for _, parameter := range parameters.Parameter {
  1091. param += " --" + parameter.Label + "=" + parameter.Value
  1092. }
  1093. }
  1094. param += " --modelname" + "=" + CkptName
  1095. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  1096. return command, nil
  1097. }
  1098. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  1099. ctx.Data["PageIsCloudBrain"] = true
  1100. ctx.Data["boot_file"] = ctx.Query("boot_file")
  1101. ctx.Data["display_job_name"] = ctx.Query("display_job_name")
  1102. ctx.Data["description"] = ctx.Query("description")
  1103. ctx.Data["image"] = strings.TrimSpace(ctx.Query("image"))
  1104. ctx.Data["src_dataset"] = ctx.Query("src_dataset") //uuid
  1105. ctx.Data["combat_dataset"] = ctx.Query("combat_dataset") //uuid
  1106. ctx.Data["evaluationIndex"] = ctx.Query("evaluation_index")
  1107. ctx.Data["run_para_list"] = ctx.Query("run_para_list")
  1108. ctx.Data["spec_id"] = ctx.QueryInt64("spec_id")
  1109. ctx.Data["train_url"] = ctx.Query("train_url")
  1110. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  1111. ctx.Data["train_url"] = ctx.Query("train_url")
  1112. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  1113. ctx.Data["model_name"] = ctx.Query("model_name")
  1114. ctx.Data["model_version"] = ctx.Query("model_version")
  1115. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  1116. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  1117. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  1118. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  1119. prepareCloudbrainOneSpecs(ctx)
  1120. prepareCloudbrainTwoInferenceSpecs(ctx)
  1121. return nil
  1122. }
  1123. func getJsonContent(url string) (string, error) {
  1124. resp, err := http.Get(url)
  1125. if err != nil || resp.StatusCode != 200 {
  1126. log.Info("Get organizations url error=" + err.Error())
  1127. return "", err
  1128. }
  1129. bytes, err := ioutil.ReadAll(resp.Body)
  1130. resp.Body.Close()
  1131. if err != nil {
  1132. log.Info("Get organizations url error=" + err.Error())
  1133. return "", err
  1134. }
  1135. str := string(bytes)
  1136. //log.Info("json str =" + str)
  1137. return str, nil
  1138. }