You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 32 kB

4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "strings"
  6. "time"
  7. "xorm.io/builder"
  8. "xorm.io/xorm"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
// CloudbrainStatus is the string status of a cloudbrain job.
type CloudbrainStatus string

// JobType is the string identifier of a job category.
type JobType string

// ModelArtsJobStatus is the string status of a ModelArts job.
type ModelArtsJobStatus string

// Known job statuses and job types.
const (
	JobWaiting CloudbrainStatus = "WAITING"
	JobStopped CloudbrainStatus = "STOPPED"
	JobSucceeded CloudbrainStatus = "SUCCEEDED"
	JobFailed CloudbrainStatus = "FAILED"
	JobRunning CloudbrainStatus = "RUNNING"
	JobTypeDebug JobType = "DEBUG"
	JobTypeBenchmark JobType = "BENCHMARK"
	JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
	JobTypeBrainScore JobType = "BRAINSCORE"
	ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" // queued for creation on free resources
	ModelArtsCreating ModelArtsJobStatus = "CREATING" // being created
	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" // creation failed
	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING" // queued for start on free resources
	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resources, waiting to start
	ModelArtsStarting ModelArtsJobStatus = "STARTING" // starting
	ModelArtsRestarting ModelArtsJobStatus = "RESTARTING" // restarting
	ModelArtsStartFailed ModelArtsJobStatus = "START_FAILED" // start failed
	ModelArtsRunning ModelArtsJobStatus = "RUNNING" // running
	ModelArtsStopping ModelArtsJobStatus = "STOPPING" // stopping
	ModelArtsStopped ModelArtsJobStatus = "STOPPED" // stopped
	ModelArtsUnavailable ModelArtsJobStatus = "UNAVAILABLE" // faulty
	ModelArtsDeleted ModelArtsJobStatus = "DELETED" // deleted
	ModelArtsResizing ModelArtsJobStatus = "RESIZING" // flavor/spec change in progress
	ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" // flavor/spec change failed
)
// Cloudbrain is the xorm-mapped record of a cloudbrain job.
type Cloudbrain struct {
	ID int64 `xorm:"pk autoincr"`
	JobID string `xorm:"INDEX NOT NULL"`
	JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
	JobName string `xorm:"INDEX"`
	Status string `xorm:"INDEX"`
	UserID int64 `xorm:"INDEX"`
	RepoID int64 `xorm:"INDEX"`
	SubTaskName string `xorm:"INDEX"`
	ContainerID string
	ContainerIp string
	CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
	UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
	Duration int64 `xorm:"INDEX duration"`
	TrainJobDuration string
	DeletedAt time.Time `xorm:"deleted"`
	// CanDebug and CanDel are excluded from the table mapping (`xorm:"-"`).
	CanDebug bool `xorm:"-"`
	CanDel bool `xorm:"-"`
	Type int `xorm:"INDEX DEFAULT 0"`
	VersionID int64 `xorm:"INDEX DEFAULT 0"`
	VersionName string
	Uuid string
	DatasetName string
	// User and Repo are excluded from the table mapping (`xorm:"-"`).
	User *User `xorm:"-"`
	Repo *Repository `xorm:"-"`
}

// CloudbrainInfo combines a Cloudbrain row with a User row; Cloudbrains
// fills it from a left join of the two tables.
type CloudbrainInfo struct {
	Cloudbrain `xorm:"extends"`
	User `xorm:"extends"`
}
// CloudBrainLoginResult holds a code, a message and a free-form payload.
type CloudBrainLoginResult struct {
	Code string
	Msg string
	Payload map[string]interface{}
}

// TaskRole is the JSON description of one task role of a job.
type TaskRole struct {
	Name string `json:"name"`
	TaskNumber int `json:"taskNumber"`
	MinSucceededTaskCount int `json:"minSucceededTaskCount"`
	MinFailedTaskCount int `json:"minFailedTaskCount"`
	CPUNumber int `json:"cpuNumber"`
	GPUNumber int `json:"gpuNumber"`
	MemoryMB int `json:"memoryMB"`
	ShmMB int `json:"shmMB"`
	Command string `json:"command"`
	NeedIBDevice bool `json:"needIBDevice"`
	IsMainRole bool `json:"isMainRole"`
	UseNNI bool `json:"useNNI"`
}

// StHostPath is the JSON description of a host path and its mount point.
type StHostPath struct {
	Path string `json:"path"`
	MountPath string `json:"mountPath"`
	ReadOnly bool `json:"readOnly"`
}

// Volume wraps a StHostPath under the "hostPath" JSON key.
type Volume struct {
	HostPath StHostPath `json:"hostPath"`
}

// CreateJobParams is the JSON body describing a job to create.
type CreateJobParams struct {
	JobName string `json:"jobName"`
	RetryCount int8 `json:"retryCount"`
	GpuType string `json:"gpuType"`
	Image string `json:"image"`
	TaskRoles []TaskRole `json:"taskRoles"`
	Volumes []Volume `json:"volumes"`
}

// CreateJobResult is a code/msg/payload JSON envelope.
type CreateJobResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// GetJobResult is a code/msg/payload JSON envelope.
type GetJobResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// GetImagesResult is a code/msg JSON envelope carrying a GetImagesPayload.
type GetImagesResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
	Payload GetImagesPayload `json:"payload"`
}

// GetImagesPayload is a list of images (JSON key "rows") plus counters.
type GetImagesPayload struct {
	Count int `json:"count"`
	TotalPages int `json:"totalPages,omitempty"`
	ImageInfo []*ImageInfo `json:"rows"`
}

// CloudbrainsOptions holds the filter and paging options read by Cloudbrains.
type CloudbrainsOptions struct {
	ListOptions
	RepoID int64 // include all repos if empty
	UserID int64
	JobID int64
	SortType string
	CloudbrainIDs []int64
	// JobStatus CloudbrainStatus
	Type int
}
  137. type TaskPod struct {
  138. TaskRoleStatus struct {
  139. Name string `json:"name"`
  140. } `json:"taskRoleStatus"`
  141. //TaskStatuses []struct {
  142. // TaskIndex int `json:"taskIndex"`
  143. // PodUID string `json:"podUid"`
  144. // PodIP string `json:"podIp"`
  145. // PodName string `json:"podName"`
  146. // ContainerID string `json:"containerId"`
  147. // ContainerIP string `json:"containerIp"`
  148. // ContainerGpus string `json:"containerGpus"`
  149. // State string `json:"state"`
  150. // StartAt time.Time `json:"startAt"`
  151. // FinishedAt time.Time `json:"finishedAt"`
  152. // ExitCode int `json:"exitCode"`
  153. // ExitDiagnostics string `json:"exitDiagnostics"`
  154. // RetriedCount int `json:"retriedCount"`
  155. // StartTime string
  156. // FinishedTime string
  157. //} `json:"taskStatuses"`
  158. TaskStatuses []TaskStatuses `json:"taskStatuses"`
  159. }
  160. type TaskStatuses struct {
  161. TaskIndex int `json:"taskIndex"`
  162. PodUID string `json:"podUid"`
  163. PodIP string `json:"podIp"`
  164. PodName string `json:"podName"`
  165. ContainerID string `json:"containerId"`
  166. ContainerIP string `json:"containerIp"`
  167. ContainerGpus string `json:"containerGpus"`
  168. State string `json:"state"`
  169. StartAt time.Time `json:"startAt"`
  170. FinishedAt time.Time `json:"finishedAt"`
  171. ExitCode int `json:"exitCode"`
  172. ExitDiagnostics string `json:"exitDiagnostics"`
  173. RetriedCount int `json:"retriedCount"`
  174. StartTime string
  175. FinishedTime string
  176. }
  177. type TaskInfo struct {
  178. Username string `json:"username"`
  179. TaskName string `json:"task_name"`
  180. CodeName string `json:"code_name"`
  181. BenchmarkCategory []string `json:"selected_category"`
  182. CodeLink string `json:"code_link"`
  183. GpuType string `json:"gpu_type"`
  184. }
  185. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  186. data, _ := json.Marshal(input)
  187. var taskPod TaskPod
  188. err := json.Unmarshal(data, &taskPod)
  189. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  190. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  191. //if the task is not finished or stopped,the cloudbrain renturns 0001-01-01 08:00:00, the finishedTime shows with -
  192. if strings.HasPrefix(taskPod.TaskStatuses[0].FinishedTime, "0001") {
  193. taskPod.TaskStatuses[0].FinishedTime = "-"
  194. }
  195. return taskPod, err
  196. }
// JobResultPayload is the JSON description of a job: its status, task
// roles, resources, submitted configuration and user info.
type JobResultPayload struct {
	ID string `json:"id"`
	Name string `json:"name"`
	Platform string `json:"platform"`
	JobStatus struct {
		Username string `json:"username"`
		State string `json:"state"`
		SubState string `json:"subState"`
		ExecutionType string `json:"executionType"`
		Retries int `json:"retries"`
		CreatedTime int64 `json:"createdTime"`
		CompletedTime int64 `json:"completedTime"`
		AppID string `json:"appId"`
		AppProgress string `json:"appProgress"`
		AppTrackingURL string `json:"appTrackingUrl"`
		AppLaunchedTime int64 `json:"appLaunchedTime"`
		AppCompletedTime interface{} `json:"appCompletedTime"`
		AppExitCode int `json:"appExitCode"`
		AppExitDiagnostics string `json:"appExitDiagnostics"`
		AppExitType interface{} `json:"appExitType"`
		VirtualCluster string `json:"virtualCluster"`
		// StartTime and EndTime have no JSON tag; ConvertToJobResultPayload
		// fills them from CreatedTime and CompletedTime.
		StartTime string
		EndTime string
	} `json:"jobStatus"`
	TaskRoles map[string]interface{} `json:"taskRoles"`
	Resource struct {
		CPU int `json:"cpu"`
		Memory string `json:"memory"`
		NvidiaComGpu int `json:"nvidia.com/gpu"`
	} `json:"resource"`
	Config struct {
		Image string `json:"image"`
		JobID string `json:"jobId"`
		GpuType string `json:"gpuType"`
		JobName string `json:"jobName"`
		JobType string `json:"jobType"`
		TaskRoles []struct {
			Name string `json:"name"`
			ShmMB int `json:"shmMB"`
			Command string `json:"command"`
			MemoryMB int `json:"memoryMB"`
			CPUNumber int `json:"cpuNumber"`
			GpuNumber int `json:"gpuNumber"`
			IsMainRole bool `json:"isMainRole"`
			TaskNumber int `json:"taskNumber"`
			NeedIBDevice bool `json:"needIBDevice"`
			MinFailedTaskCount int `json:"minFailedTaskCount"`
			MinSucceededTaskCount int `json:"minSucceededTaskCount"`
		} `json:"taskRoles"`
		RetryCount int `json:"retryCount"`
	} `json:"config"`
	Userinfo struct {
		User string `json:"user"`
		OrgID string `json:"org_id"`
	} `json:"userinfo"`
}
  253. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  254. data, _ := json.Marshal(input)
  255. var jobResultPayload JobResultPayload
  256. err := json.Unmarshal(data, &jobResultPayload)
  257. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  258. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  259. if jobResultPayload.JobStatus.State == string(JobWaiting) {
  260. jobResultPayload.JobStatus.StartTime = "-"
  261. jobResultPayload.JobStatus.EndTime = "-"
  262. }
  263. return jobResultPayload, err
  264. }
// ImagesResultPayload is a JSON list of image descriptions.
// NOTE(review): the list is tagged "taskStatuses", which looks copied from
// TaskPod — confirm against the actual response before relying on it.
type ImagesResultPayload struct {
	Images []struct {
		ID int `json:"id"`
		Name string `json:"name"`
		Place string `json:"place"`
		Description string `json:"description"`
		Provider string `json:"provider"`
		Createtime string `json:"createtime"`
		Remark string `json:"remark"`
	} `json:"taskStatuses"`
}

// ImageInfo is the JSON description of one image. PlaceView has no JSON tag.
type ImageInfo struct {
	ID int `json:"id"`
	Name string `json:"name"`
	Place string `json:"place"`
	Description string `json:"description"`
	Provider string `json:"provider"`
	Createtime string `json:"createtime"`
	Remark string `json:"remark"`
	IsPublic int `json:"isPublic"`
	PlaceView string
}

// Categories is a JSON list of Category entries under the key "category".
type Categories struct {
	Category []*Category `json:"category"`
}

// Category is an id/value pair.
type Category struct {
	Id int `json:"id"`
	Value string `json:"value"`
}

// GpuInfos is a JSON list of GpuInfo entries under the key "gpu_type".
type GpuInfos struct {
	GpuInfo []*GpuInfo `json:"gpu_type"`
}

// GpuInfo is an id/value pair with an associated queue name.
type GpuInfo struct {
	Id int `json:"id"`
	Value string `json:"value"`
	Queue string `json:"queue"`
}

// ResourceSpecs is a JSON list of ResourceSpec entries. The key is spelled
// "resorce_specs" and must stay that way to match existing data.
type ResourceSpecs struct {
	ResourceSpec []*ResourceSpec `json:"resorce_specs"`
}

// ResourceSpec is one resource specification: CPU and GPU counts plus
// memory and shared-memory sizes.
type ResourceSpec struct {
	Id int `json:"id"`
	CpuNum int `json:"cpu"`
	GpuNum int `json:"gpu"`
	MemMiB int `json:"memMiB"`
	ShareMemMiB int `json:"shareMemMiB"`
}

// FlavorInfos is a JSON list of FlavorInfo entries under the key "flavor_info".
type FlavorInfos struct {
	FlavorInfo []*FlavorInfo `json:"flavor_info"`
}

// FlavorInfo is an id/value pair.
type FlavorInfo struct {
	Id int `json:"id"`
	Value string `json:"value"`
}

// PoolInfos is a JSON list of PoolInfo entries under the key "pool_info".
type PoolInfos struct {
	PoolInfo []*PoolInfo `json:"pool_info"`
}

// PoolInfo is the id, name and type of a pool.
type PoolInfo struct {
	PoolId string `json:"pool_id"`
	PoolName string `json:"pool_name"`
	PoolType string `json:"pool_type"`
}

// CommitImageParams is the JSON body for committing a task container as an
// image.
type CommitImageParams struct {
	Ip string `json:"ip"`
	TaskContainerId string `json:"taskContainerId"`
	ImageTag string `json:"imageTag"`
	ImageDescription string `json:"imageDescription"`
}

// CommitImageResult is a code/msg/payload JSON envelope.
type CommitImageResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// CloudBrainResult is a code/msg JSON envelope without payload.
type CloudBrainResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
}
// CreateNotebookParams is the JSON body describing a notebook to create.
type CreateNotebookParams struct {
	JobName string `json:"name"`
	Description string `json:"description"`
	ProfileID string `json:"profile_id"`
	Flavor string `json:"flavor"`
	Spec Spec `json:"spec"`
	Workspace Workspace `json:"workspace"`
	Pool Pool `json:"pool"`
}

// Pool is the id, name and type of a pool.
type Pool struct {
	ID string `json:"id"`
	Name string `json:"name"`
	Type string `json:"type"`
}

// Workspace identifies a workspace by id.
type Workspace struct {
	ID string `json:"id"`
}

// Spec groups the storage and auto-stop settings of a notebook.
type Spec struct {
	Storage Storage `json:"storage"`
	AutoStop AutoStop `json:"auto_stop"`
}

// AutoStop holds the auto-stop switch and its duration.
type AutoStop struct {
	Enable bool `json:"enable"`
	Duration int `json:"duration"`
}

// Storage holds a storage type and its location.
type Storage struct {
	Type string `json:"type"`
	Location Location `json:"location"`
}

// Location holds a storage path.
type Location struct {
	Path string `json:"path"`
}

// NotebookResult is an error_code/error_msg JSON envelope.
type NotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
}

// CreateNotebookResult is the JSON result of creating a notebook: error
// fields plus the notebook's identity, status, profile and flavor details.
type CreateNotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	ID string `json:"id"`
	Name string `json:"name"`
	Description string `json:"description"`
	Status string `json:"status"`
	CreationTimestamp string `json:"creation_timestamp"`
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	Profile struct {
		ID string `json:"id"`
		Name string `json:"name"`
		Description string `json:"description"`
		DeType string `json:"de_type"`
		FlavorType string `json:"flavor_type"`
	} `json:"profile"`
	Flavor string `json:"flavor"`
	FlavorDetails struct {
		Name string `json:"name"`
		Status string `json:"status"`
		QueuingNum int `json:"queuing_num"`
		QueueLeftTime int `json:"queue_left_time"` // seconds
		Duration int `json:"duration"` // auto-stop time, seconds
	} `json:"flavor_details"`
}
// GetNotebookResult is the JSON description of a notebook: error fields,
// identity, status, profile, flavor details, queuing info and spec
// annotations. CreateTime, LatestUpdateTime, BeginTime and EndTime have no
// JSON tag; nothing in this file fills them.
type GetNotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	ID string `json:"id"`
	Name string `json:"name"`
	Description string `json:"description"`
	Status string `json:"status"`
	CreationTimestamp string `json:"creation_timestamp"`
	CreateTime string
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	LatestUpdateTime string
	Profile struct {
		ID string `json:"id"`
		Name string `json:"name"`
		Description string `json:"description"`
		DeType string `json:"de_type"`
		FlavorType string `json:"flavor_type"`
	} `json:"profile"`
	Flavor string `json:"flavor"`
	FlavorDetails struct {
		Name string `json:"name"`
		Status string `json:"status"`
		QueuingNum int `json:"queuing_num"`
		QueueLeftTime int `json:"queue_left_time"` // seconds
		Duration int `json:"duration"` // auto-stop time, seconds
	} `json:"flavor_details"`
	QueuingInfo struct {
		ID string `json:"id"`
		Name string `json:"name"`
		Flavor string `json:"flavor"`
		DeType string `json:"de_type"`
		Status string `json:"status"`
		BeginTimestamp int `json:"begin_timestamp"` // time the instance entered the queue
		BeginTime string
		RemainTime int `json:"remain_time"` // remaining time of the instance
		EndTimestamp int `json:"end_timestamp"`
		EndTime string
		Rank int `json:"rank"` // rank of the instance in the queue
	} `json:"queuing_info"`
	Spec struct {
		Annotations struct {
			TargetDomain string `json:"target_domain"`
			Url string `json:"url"`
		} `json:"annotations"`
	} `json:"spec"`
}
// GetTokenParams is the JSON body of a token request; it wraps Auth.
type GetTokenParams struct {
	Auth Auth `json:"auth"`
}

// Auth combines the identity to authenticate and the requested scope.
type Auth struct {
	Identity Identity `json:"identity"`
	Scope Scope `json:"scope"`
}

// Scope wraps the project a token is requested for.
type Scope struct {
	Project Project `json:"project"`
}

// Project identifies a project by name.
type Project struct {
	Name string `json:"name"`
}

// Identity lists the authentication methods and the password credentials.
type Identity struct {
	Methods []string `json:"methods"`
	Password Password `json:"password"`
}

// Password wraps the user whose password is used to authenticate.
type Password struct {
	User NotebookUser `json:"user"`
}

// NotebookUser holds a user name, password and domain.
type NotebookUser struct {
	Name string `json:"name"`
	Password string `json:"password"`
	Domain Domain `json:"domain"`
}

// Domain identifies a domain by name.
type Domain struct {
	Name string `json:"name"`
}
// Action names; presumably the values carried in NotebookAction.Action —
// confirm against callers.
const (
	ActionStart = "start"
	ActionStop = "stop"
	ActionRestart = "restart"
	ActionQueue = "queue"
	ActionDequeue = "dequeue"
)

// NotebookAction is the JSON body naming an action.
type NotebookAction struct {
	Action string `json:"action"`
}

// NotebookActionResult is the JSON result of an action: error fields plus
// the current status and previous state.
type NotebookActionResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	CurrentStatus string `json:"current_status"`
	PreviousState string `json:"previous_state"`
}

// NotebookGetJobTokenResult is the JSON result carrying a token.
type NotebookGetJobTokenResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	Token string `json:"token"`
}

// NotebookDelResult is the JSON result carrying an instance id.
type NotebookDelResult struct {
	InstanceID string `json:"instance_id"`
}
// CreateTrainJobParams is the JSON body describing a training job to create.
type CreateTrainJobParams struct {
	JobName string `json:"job_name"`
	Description string `json:"job_desc"`
	Config Config `json:"config"`
	WorkspaceID string `json:"workspace_id"`
}

// Config is the JSON configuration of a training job.
type Config struct {
	WorkServerNum int `json:"worker_server_num"`
	AppUrl string `json:"app_url"` // code directory of the training job
	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter []Parameter `json:"parameter"`
	DataUrl string `json:"data_url"` // OBS path URL of the dataset required by the training job
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}

// CreateConfigParams is the JSON body describing a named training job
// configuration to create. Its fields match Config, plus a name and
// description and without CreateVersion.
type CreateConfigParams struct {
	ConfigName string `json:"config_name"`
	Description string `json:"config_desc"`
	WorkServerNum int `json:"worker_server_num"`
	AppUrl string `json:"app_url"` // code directory of the training job
	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter []Parameter `json:"parameter"`
	DataUrl string `json:"data_url"` // OBS path URL of the dataset required by the training job
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// Parameter is a label/value pair.
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}

// Parameters is a JSON list of Parameter entries under the key "parameter".
type Parameters struct {
	Parameter []Parameter `json:"parameter"`
}

// DataSource is the JSON description of a dataset source.
type DataSource struct {
	DatasetID string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`
	Type string `json:"type"`
	DataUrl string `json:"data_url"`
}

// Volumes groups an NFS mount and a host-path mount.
type Volumes struct {
	Nfs Nfs `json:"nfs"`
	HostPath HostPath `json:"host_path"`
}

// Nfs is the JSON description of an NFS mount.
type Nfs struct {
	ID string `json:"id"`
	SourcePath string `json:"src_path"`
	DestPath string `json:"dest_path"`
	ReadOnly bool `json:"read_only"`
}

// HostPath is the JSON description of a host-path mount.
type HostPath struct {
	SourcePath string `json:"src_path"`
	DestPath string `json:"dest_path"`
	ReadOnly bool `json:"read_only"`
}

// Flavor identifies a flavor by code.
type Flavor struct {
	Code string `json:"code"`
}
// CreateTrainJobResult is the JSON result of creating a training job.
type CreateTrainJobResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	JobName string `json:"job_name"`
	JobID int64 `json:"job_id"`
	Status int `json:"status"`
	CreateTime int64 `json:"create_time"`
	VersionID int64 `json:"version_id"`
	ResourceID string `json:"resource_id"`
	VersionName string `json:"version_name"`
}

// CreateTrainJobConfigResult is an error/success JSON envelope.
type CreateTrainJobConfigResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
}

// GetResourceSpecsResult is the JSON list of resource specifications.
type GetResourceSpecsResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	SpecTotalCount int `json:"spec_total_count"`
	Specs []Specs `json:"specs"`
}

// Specs is the JSON description of one resource specification.
type Specs struct {
	Core string `json:"core"`
	Cpu string `json:"cpu"`
	IsNoResource bool `json:"no_resource"`
	GpuType string `json:"gpu_type"`
	SpecID int64 `json:"spec_id"`
	GpuNum int `json:"gpu_num"`
	SpecCode string `json:"spec_code"`
	Storage string `json:"storage"`
	MaxNum int `json:"max_num"`
	UnitNum int `json:"unit_num"`
	InterfaceType int `json:"interface_type"`
}

// GetConfigListResult is the JSON list of training job configurations.
type GetConfigListResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	ConfigTotalCount int `json:"config_total_count"`
	ParaConfigs []ParaConfig `json:"configs"`
}

// ParaConfig is the JSON summary of one training job configuration. Result
// has no JSON tag.
type ParaConfig struct {
	ConfigName string `json:"config_name"`
	ConfigDesc string `json:"config_desc"`
	CreateTime int64 `json:"create_time"`
	EngineType int `json:"engine_type"`
	EngineName string `json:"engine_name"`
	EngineId int64 `json:"engine_id"`
	EngineVersion string `json:"engine_version"`
	UserImageUrl string `json:"user_image_url"`
	UserCommand string `json:"user_command"`
	Result GetConfigResult
}
// GetConfigResult is the JSON description of one training job
// configuration, together with error/success fields.
type GetConfigResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	ConfigName string `json:"config_name"`
	Description string `json:"config_desc"`
	WorkServerNum int `json:"worker_server_num"`
	AppUrl string `json:"app_url"` // code directory of the training job
	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter []Parameter `json:"parameter"`
	DataUrl string `json:"data_url"` // OBS path URL of the dataset required by the training job
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}

// ErrorResult is an error/success JSON envelope. Unlike the other result
// types in this file, its message key is "error_message", not "error_msg".
type ErrorResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_message"`
	IsSuccess bool `json:"is_success"`
}
// GetTrainJobResult is the JSON description of a training job version.
// Status, CreateTime, TrainJobDuration and DatasetName have no JSON tag;
// nothing in this file fills them.
type GetTrainJobResult struct {
	IsSuccess bool `json:"is_success"`
	JobName string `json:"job_name"`
	JobID int64 `json:"job_id"`
	Description string `json:"job_desc"`
	IntStatus int `json:"status"`
	Status string
	LongCreateTime int64 `json:"create_time"`
	CreateTime string
	Duration int64 `json:"duration"` // running time of the training job, in milliseconds
	TrainJobDuration string // running time of the training job, formatted as hh:mm:ss
	VersionID int64 `json:"version_id"`
	ResourceID string `json:"resource_id"`
	VersionName string `json:"version_name"`
	PreVersionID int64 `json:"pre_version_id"`
	WorkServerNum int `json:"worker_server_num"`
	AppUrl string `json:"app_url"` // code directory of the training job
	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter []Parameter `json:"parameter"`
	DataUrl string `json:"data_url"` // OBS path URL of the dataset required by the training job
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	EngineName string `json:"engine_name"`
	EngineVersion string `json:"engine_version"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
	PoolName string `json:"pool_name"`
	NasMountPath string `json:"nas_mount_path"`
	NasShareAddr string `json:"nas_share_addr"`
	DatasetName string
}
// GetTrainJobLogResult is the JSON result carrying a chunk of log content
// and its line range.
type GetTrainJobLogResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	Content string `json:"content"`
	Lines int `json:"lines"`
	StartLine string `json:"start_line"`
	EndLine string `json:"end_line"`
}

// GetTrainJobLogFileNamesResult is the JSON result listing log file names.
type GetTrainJobLogFileNamesResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	LogFileList []string `json:"log_file_list"`
}

// TrainJobResult is an error/success JSON envelope.
type TrainJobResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
}

// LogFile holds the name of a log file.
type LogFile struct {
	Name string
}
  732. func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  733. sess := x.NewSession()
  734. defer sess.Close()
  735. var cond = builder.NewCond()
  736. if opts.RepoID > 0 {
  737. cond = cond.And(
  738. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  739. )
  740. }
  741. if opts.UserID > 0 {
  742. cond = cond.And(
  743. builder.Eq{"cloudbrain.user_id": opts.UserID},
  744. )
  745. }
  746. if (opts.JobID) > 0 {
  747. cond = cond.And(
  748. builder.Eq{"cloudbrain.job_id": opts.JobID},
  749. )
  750. }
  751. if (opts.Type) >= 0 {
  752. cond = cond.And(
  753. builder.Eq{"cloudbrain.type": opts.Type},
  754. )
  755. }
  756. // switch opts.JobStatus {
  757. // case JobWaiting:
  758. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  759. // case JobFailed:
  760. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  761. // case JobStopped:
  762. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  763. // case JobSucceeded:
  764. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  765. // }
  766. if len(opts.CloudbrainIDs) > 0 {
  767. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  768. }
  769. count, err := sess.Where(cond).Count(new(Cloudbrain))
  770. if err != nil {
  771. return nil, 0, fmt.Errorf("Count: %v", err)
  772. }
  773. if opts.Page >= 0 && opts.PageSize > 0 {
  774. var start int
  775. if opts.Page == 0 {
  776. start = 0
  777. } else {
  778. start = (opts.Page - 1) * opts.PageSize
  779. }
  780. sess.Limit(opts.PageSize, start)
  781. }
  782. sess.OrderBy("cloudbrain.created_unix DESC")
  783. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  784. if err := sess.Table(&Cloudbrain{}).Where(cond).
  785. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  786. Find(&cloudbrains); err != nil {
  787. return nil, 0, fmt.Errorf("Find: %v", err)
  788. }
  789. sess.Close()
  790. return cloudbrains, count, nil
  791. }
  792. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  793. if _, err = x.Insert(cloudbrain); err != nil {
  794. return err
  795. }
  796. return nil
  797. }
  798. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  799. has, err := x.Get(cb)
  800. if err != nil {
  801. return nil, err
  802. } else if !has {
  803. return nil, ErrJobNotExist{}
  804. }
  805. return cb, nil
  806. }
  807. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  808. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  809. return getRepoCloudBrain(cb)
  810. }
  811. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  812. cb := &Cloudbrain{JobID: jobID}
  813. return getRepoCloudBrain(cb)
  814. }
  815. func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
  816. cloudBrains := make([]*Cloudbrain, 0)
  817. err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
  818. return cloudBrains, err
  819. }
  820. func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
  821. cloudBrains := make([]*Cloudbrain, 0)
  822. err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
  823. return cloudBrains, err
  824. }
  825. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  826. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  827. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  828. return
  829. }
  830. func SetTrainJobStatusByJobID(jobID string, status string, duration int64, trainjobduration string) (err error) {
  831. cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration, TrainJobDuration: trainjobduration}
  832. _, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
  833. return
  834. }
  835. func UpdateJob(job *Cloudbrain) error {
  836. return updateJob(x, job)
  837. }
  838. func updateJob(e Engine, job *Cloudbrain) error {
  839. var sess *xorm.Session
  840. sess = e.Where("job_id = ?", job.JobID)
  841. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  842. return err
  843. }
  844. // func UpdateTrainJob(job *CloudbrainInfo) error {
  845. // return updateTrainJob(x, job)
  846. // }
  847. // func updateTrainJob(e Engine, job *CloudbrainInfo) error {
  848. // var sess *xorm.Session
  849. // sess = e.Where("job_id = ?", job.Cloudbrain.JobID)
  850. // _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  851. // return err
  852. // }
  853. func DeleteJob(job *Cloudbrain) error {
  854. return deleteJob(x, job)
  855. }
  856. func deleteJob(e Engine, job *Cloudbrain) error {
  857. _, err := e.ID(job.ID).Delete(job)
  858. return err
  859. }
  860. func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
  861. cb := &Cloudbrain{JobName: jobName}
  862. return getRepoCloudBrain(cb)
  863. }
  864. func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
  865. if !isSigned || (job.Status != string(JobStopped) && job.Status != string(JobFailed) && job.Status != string(ModelArtsStartFailed) && job.Status != string(ModelArtsCreateFailed)) {
  866. return false
  867. }
  868. repo, err := GetRepositoryByID(job.RepoID)
  869. if err != nil {
  870. log.Error("GetRepositoryByID failed:%v", err.Error())
  871. return false
  872. }
  873. permission, _ := GetUserRepoPermission(repo, user)
  874. if err != nil {
  875. log.Error("GetUserRepoPermission failed:%v", err.Error())
  876. return false
  877. }
  878. if (user.ID == job.UserID && permission.AccessMode >= AccessModeWrite) || user.IsAdmin || permission.AccessMode >= AccessModeAdmin {
  879. return true
  880. }
  881. return false
  882. }