From 9b65739eada4c0fea1d08f38d3ca5b3457cf20d1 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 23 Nov 2022 16:08:23 +0100 Subject: [PATCH 001/225] Start creating a `dora-daemon` --- Cargo.lock | 69 ++++++++++++++++- binaries/daemon/Cargo.toml | 17 ++++ binaries/daemon/src/main.rs | 150 ++++++++++++++++++++++++++++++++++++ 3 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 binaries/daemon/Cargo.toml create mode 100644 binaries/daemon/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index e3b204c1..692ce19c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -287,6 +287,18 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + [[package]] name = "block-buffer" version = "0.7.3" @@ -952,6 +964,21 @@ dependencies = [ "zenoh-config", ] +[[package]] +name = "dora-daemon" +version = "0.1.0" +dependencies = [ + "eyre", + "futures-concurrency 7.0.0", + "serde", + "serde_json", + "shared_memory", + "tokio", + "tokio-stream", + "tracing", + "tracing-subscriber", +] + [[package]] name = "dora-download" version = "0.1.0" @@ -1282,6 +1309,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + [[package]] name = "futures" version = "0.3.25" @@ -1328,6 +1361,17 @@ dependencies = [ "pin-project", ] +[[package]] +name = "futures-concurrency" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a740c32e1bde284ce2f51df98abd4fa38e9e539670443c111211777e3ab09927" +dependencies = [ 
+ "bitvec", + "futures-core", + "pin-project", +] + [[package]] name = "futures-core" version = "0.3.25" @@ -3046,6 +3090,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + [[package]] name = "rand" version = "0.8.5" @@ -3743,6 +3793,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + [[package]] name = "target-lexicon" version = "0.12.4" @@ -3916,9 +3972,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.9" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df54d54117d6fdc4e4fea40fe1e4e566b3505700e148a6827e59b34b0d2600d9" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -4666,6 +4722,15 @@ dependencies = [ "syn", ] +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + [[package]] name = "yaml-rust" version = "0.4.5" diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml new file mode 100644 index 00000000..a730bde0 --- /dev/null +++ b/binaries/daemon/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "dora-daemon" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +eyre = "0.6.8" +tokio = { version = "1.20.1", features = ["full"] } +tokio-stream = { version = "0.1.11", features = ["net"] } +tracing = "0.1.36" +tracing-subscriber = "0.3.15" +futures-concurrency = "7.0.0" +serde = { 
version = "1.0.136", features = ["derive"] } +serde_json = "1.0.86" +shared_memory = "0.12.0" diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs new file mode 100644 index 00000000..d71f0e34 --- /dev/null +++ b/binaries/daemon/src/main.rs @@ -0,0 +1,150 @@ +use eyre::{eyre, Context}; +use futures_concurrency::stream::Merge; +use shared_memory::ShmemConf; +use std::{collections::HashMap, io::ErrorKind, net::Ipv4Addr}; +use tokio::{ + io::{AsyncReadExt, AsyncWriteExt}, + net::{TcpListener, TcpStream}, + sync::mpsc, +}; +use tokio_stream::{ + wrappers::{ReceiverStream, TcpListenerStream}, + StreamExt, +}; + +const PORT: u16 = 0xD02A; + +#[tokio::main] +async fn main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing subscriber")?; + + let localhost = Ipv4Addr::new(127, 0, 0, 1); + let socket = match TcpListener::bind((localhost, PORT)).await { + Ok(socket) => socket, + Err(err) if err.kind() == ErrorKind::AddrInUse => { + eyre::bail!("port {PORT} is already in use. 
Is `dora-daemon` already running?"); + } + Err(err) => { + return Err(eyre::Report::new(err).wrap_err(format!("failed to listen on port {PORT}"))) + } + }; + + // TODO: set up connection to coordinator + + let new_connections = TcpListenerStream::new(socket).map(|c| { + c.map(Event::NewConnection) + .wrap_err("failed to open connection") + .unwrap_or_else(Event::ConnectError) + }); + let (node_events_tx, node_events_rx) = mpsc::channel(10); + let node_events = ReceiverStream::new(node_events_rx).map(Event::Node); + + let mut events = (new_connections, node_events).merge(); + + let mut uninit_shared_memory = HashMap::new(); + let mut sent_out_shared_memory = HashMap::new(); + + while let Some(event) = events.next().await { + match event { + Event::NewConnection(mut connection) => { + let events_tx = node_events_tx.clone(); + tokio::spawn(async move { + loop { + let raw = match tcp_receive(&mut connection).await { + Ok(data) => data, + Err(err) if err.kind() == ErrorKind::UnexpectedEof => { + break; + } + Err(err) => { + tracing::error!("{err:?}"); + continue; + } + }; + let event = match serde_json::from_slice(&raw) + .wrap_err("failed to deserialize node message") + { + Ok(e) => e, + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + }; + let Ok(()) = events_tx.send(event).await else { + break; + }; + } + }); + } + Event::ConnectError(err) => { + tracing::warn!("{:?}", err.wrap_err("failed to connect")); + } + Event::Node(event) => match event { + NodeEvent::PrepareOutputMessage { len } => { + let memory = ShmemConf::new() + .size(len) + .create() + .wrap_err("failed to allocate shared memory")?; + let id = memory.get_os_id().to_owned(); + uninit_shared_memory.insert(id, memory); + + // TODO send reply with id + } + NodeEvent::SendOutMessage { id } => { + let memory = uninit_shared_memory + .remove(&id) + .ok_or_else(|| eyre!("invalid shared memory id"))?; + + // TODO send shared memory ID to all local receivers + + let data = 
std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); + // TODO send `data` via network to all remove receivers + + sent_out_shared_memory.insert(id, memory); + } + }, + } + } + + Ok(()) +} + +enum Event { + NewConnection(TcpStream), + ConnectError(eyre::Report), + Node(NodeEvent), +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +enum NodeEvent { + PrepareOutputMessage { len: usize }, + SendOutMessage { id: MessageId }, +} + +type MessageId = String; + +async fn tcp_send(connection: &mut TcpStream, request: &[u8]) -> std::io::Result<()> { + let len_raw = (request.len() as u64).to_le_bytes(); + connection.write_all(&len_raw).await?; + connection.write_all(request).await?; + Ok(()) +} + +async fn tcp_receive(connection: &mut TcpStream) -> std::io::Result> { + let reply_len = { + let mut raw = [0; 8]; + connection.read_exact(&mut raw).await?; + u64::from_le_bytes(raw) as usize + }; + let mut reply = vec![0; reply_len]; + connection.read_exact(&mut reply).await?; + Ok(reply) +} + +fn set_up_tracing() -> eyre::Result<()> { + use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer().pretty(); + let subscriber = tracing_subscriber::Registry::default().with(stdout_log); + tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} From f7c6cca9438fd6492f084c80e0e47dd416f02c7a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 28 Nov 2022 11:48:40 +0100 Subject: [PATCH 002/225] Move port number constant to `dora-core` --- Cargo.lock | 1 + binaries/daemon/Cargo.toml | 1 + binaries/daemon/src/main.rs | 14 +++++++++----- libraries/core/src/topics.rs | 2 ++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 692ce19c..42449c16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -968,6 +968,7 @@ dependencies = [ name = "dora-daemon" version = "0.1.0" dependencies = [ + "dora-core", "eyre", 
"futures-concurrency 7.0.0", "serde", diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index a730bde0..6512ce92 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -15,3 +15,4 @@ futures-concurrency = "7.0.0" serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.86" shared_memory = "0.12.0" +dora-core = { path = "../../libraries/core" } diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index d71f0e34..546c6988 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,3 +1,4 @@ +use dora_core::topics::DORA_DAEMON_PORT_DEFAULT; use eyre::{eyre, Context}; use futures_concurrency::stream::Merge; use shared_memory::ShmemConf; @@ -12,20 +13,23 @@ use tokio_stream::{ StreamExt, }; -const PORT: u16 = 0xD02A; - #[tokio::main] async fn main() -> eyre::Result<()> { set_up_tracing().wrap_err("failed to set up tracing subscriber")?; let localhost = Ipv4Addr::new(127, 0, 0, 1); - let socket = match TcpListener::bind((localhost, PORT)).await { + let socket = match TcpListener::bind((localhost, DORA_DAEMON_PORT_DEFAULT)).await { Ok(socket) => socket, Err(err) if err.kind() == ErrorKind::AddrInUse => { - eyre::bail!("port {PORT} is already in use. Is `dora-daemon` already running?"); + eyre::bail!( + "port {DORA_DAEMON_PORT_DEFAULT} is already in use. \ + Is `dora-daemon` already running?" 
+ ); } Err(err) => { - return Err(eyre::Report::new(err).wrap_err(format!("failed to listen on port {PORT}"))) + return Err(eyre::Report::new(err).wrap_err(format!( + "failed to listen on port {DORA_DAEMON_PORT_DEFAULT}" + ))) } }; diff --git a/libraries/core/src/topics.rs b/libraries/core/src/topics.rs index d7181885..ca05fb41 100644 --- a/libraries/core/src/topics.rs +++ b/libraries/core/src/topics.rs @@ -5,6 +5,8 @@ use std::{ }; use uuid::Uuid; +pub const DORA_DAEMON_PORT_DEFAULT: u16 = 0xD02A; + pub const MANUAL_STOP: &str = "dora/stop"; pub fn control_socket_addr() -> SocketAddr { From 7d85b92b2a4b3986d33a93e681d46a3d85d79018 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 28 Nov 2022 12:57:06 +0100 Subject: [PATCH 003/225] Connect to dora-daemon from node API and register with node ID --- Cargo.lock | 5 ++- apis/rust/node/Cargo.toml | 1 + apis/rust/node/src/daemon.rs | 61 +++++++++++++++++++++++++++ apis/rust/node/src/lib.rs | 5 +++ binaries/daemon/src/main.rs | 60 +++++++++++++++++++++++--- libraries/core/src/daemon_messages.rs | 15 +++++++ libraries/core/src/lib.rs | 1 + 7 files changed, 139 insertions(+), 9 deletions(-) create mode 100644 apis/rust/node/src/daemon.rs create mode 100644 libraries/core/src/daemon_messages.rs diff --git a/Cargo.lock b/Cargo.lock index 42449c16..123f4fb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1033,6 +1033,7 @@ dependencies = [ "flume", "once_cell", "serde", + "serde_json", "serde_yaml 0.8.23", "thiserror", "tokio", @@ -3534,9 +3535,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.86" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074" +checksum = "020ff22c755c2ed3f8cf162dbb41a7268d934702f3ed3631656ea597e08fc3db" dependencies = [ "itoa", "ryu", diff --git a/apis/rust/node/Cargo.toml b/apis/rust/node/Cargo.toml index 4ae127d8..984f1b32 100644 --- a/apis/rust/node/Cargo.toml +++ 
b/apis/rust/node/Cargo.toml @@ -15,6 +15,7 @@ eyre = "0.6.7" once_cell = "1.13.0" serde = { version = "1.0.136", features = ["derive"] } serde_yaml = "0.8.23" +serde_json = "1.0.89" thiserror = "1.0.30" tracing = "0.1.33" tracing-subscriber = { version = "0.3.15", optional = true } diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs new file mode 100644 index 00000000..91bdfab7 --- /dev/null +++ b/apis/rust/node/src/daemon.rs @@ -0,0 +1,61 @@ +use std::{ + io::{Read, Write}, + net::{Ipv4Addr, TcpStream}, +}; + +use dora_core::{config::NodeId, topics::DORA_DAEMON_PORT_DEFAULT}; +use eyre::{eyre, Context}; + +pub struct DaemonConnection { + stream: TcpStream, +} + +impl DaemonConnection { + pub fn init(node_id: NodeId) -> eyre::Result { + let localhost = Ipv4Addr::new(127, 0, 0, 1); + let mut stream = TcpStream::connect((localhost, DORA_DAEMON_PORT_DEFAULT)) + .wrap_err("failed to connect to dora-daemon")?; + + tcp_send( + &mut stream, + &dora_core::daemon_messages::Request::Register { node_id }, + ) + .wrap_err("failed to send register request to dora-daemon")?; + + match tcp_receive(&mut stream) + .wrap_err("failed to receive register reply from dora-daemon")? 
+ { + dora_core::daemon_messages::Reply::RegisterResult(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to register node with dora-daemon")?, + } + + Ok(Self { stream }) + } +} + +fn tcp_send( + connection: &mut TcpStream, + request: &dora_core::daemon_messages::Request, +) -> std::io::Result<()> { + let serialized = serde_json::to_vec(request)?; + + let len_raw = (serialized.len() as u64).to_le_bytes(); + connection.write_all(&len_raw)?; + connection.write_all(&serialized)?; + Ok(()) +} + +fn tcp_receive(connection: &mut TcpStream) -> std::io::Result { + let reply_len = { + let mut raw = [0; 8]; + connection.read_exact(&mut raw)?; + u64::from_le_bytes(raw) as usize + }; + let mut reply_raw = vec![0; reply_len]; + connection.read_exact(&mut reply_raw)?; + + let reply = serde_json::from_slice(&reply_raw)?; + + Ok(reply) +} diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 53e35701..7557d10a 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -1,6 +1,7 @@ pub use communication::Input; use communication::STOP_TOPIC; use communication_layer_pub_sub::CommunicationLayer; +use daemon::DaemonConnection; pub use dora_core; use dora_core::config::{CommunicationConfig, DataId, NodeId, NodeRunConfig}; pub use dora_message::{uhlc, Metadata, MetadataParameters}; @@ -8,6 +9,7 @@ use eyre::WrapErr; pub use flume::Receiver; pub mod communication; +pub mod daemon; pub struct DoraNode { id: NodeId, @@ -44,6 +46,9 @@ impl DoraNode { node_config: NodeRunConfig, communication_config: CommunicationConfig, ) -> eyre::Result { + let daemon = + DaemonConnection::init(id.clone()).wrap_err("failed to connect to dora-daemon")?; + let communication = communication::init(&communication_config)?; Ok(Self { id, diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 546c6988..276b311b 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,4 +1,4 @@ -use 
dora_core::topics::DORA_DAEMON_PORT_DEFAULT; +use dora_core::{config::NodeId, daemon_messages, topics::DORA_DAEMON_PORT_DEFAULT}; use eyre::{eyre, Context}; use futures_concurrency::stream::Merge; use shared_memory::ShmemConf; @@ -41,7 +41,7 @@ async fn main() -> eyre::Result<()> { .unwrap_or_else(Event::ConnectError) }); let (node_events_tx, node_events_rx) = mpsc::channel(10); - let node_events = ReceiverStream::new(node_events_rx).map(Event::Node); + let node_events = ReceiverStream::new(node_events_rx); let mut events = (new_connections, node_events).merge(); @@ -52,6 +52,7 @@ async fn main() -> eyre::Result<()> { match event { Event::NewConnection(mut connection) => { let events_tx = node_events_tx.clone(); + let mut id = None; tokio::spawn(async move { loop { let raw = match tcp_receive(&mut connection).await { @@ -64,7 +65,7 @@ async fn main() -> eyre::Result<()> { continue; } }; - let event = match serde_json::from_slice(&raw) + let message: daemon_messages::Request = match serde_json::from_slice(&raw) .wrap_err("failed to deserialize node message") { Ok(e) => e, @@ -73,6 +74,51 @@ async fn main() -> eyre::Result<()> { continue; } }; + + let node_event = match message { + daemon_messages::Request::Register { node_id } => { + id = Some(node_id); + + let reply = daemon_messages::Reply::RegisterResult(Ok(())); + let serialized = serde_json::to_vec(&reply) + .wrap_err("failed to serialize register result"); + + let send_result = match serialized { + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + Ok(m) => tcp_send(&mut connection, &m).await, + }; + + match send_result { + Ok(()) => continue, + Err(err) => { + tracing::warn!("{err:?}"); + break; // close connection + } + } + } + daemon_messages::Request::PrepareOutputMessage { len } => { + NodeEvent::PrepareOutputMessage { len } + } + daemon_messages::Request::SendOutMessage { id } => { + NodeEvent::SendOutMessage { id } + } + }; + let event = Event::Node { + id: match &id { + Some(id) => 
id.clone(), + None => { + tracing::warn!( + "Ignoring node event because no register \ + message was sent yet: {node_event:?}" + ); + continue; + } + }, + event: node_event, + }; let Ok(()) = events_tx.send(event).await else { break; }; @@ -82,7 +128,7 @@ async fn main() -> eyre::Result<()> { Event::ConnectError(err) => { tracing::warn!("{:?}", err.wrap_err("failed to connect")); } - Event::Node(event) => match event { + Event::Node { id, event } => match event { NodeEvent::PrepareOutputMessage { len } => { let memory = ShmemConf::new() .size(len) @@ -115,11 +161,11 @@ async fn main() -> eyre::Result<()> { enum Event { NewConnection(TcpStream), ConnectError(eyre::Report), - Node(NodeEvent), + Node { id: NodeId, event: NodeEvent }, } -#[derive(Debug, serde::Serialize, serde::Deserialize)] -enum NodeEvent { +#[derive(Debug)] +pub enum NodeEvent { PrepareOutputMessage { len: usize }, SendOutMessage { id: MessageId }, } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs new file mode 100644 index 00000000..ab0e6b02 --- /dev/null +++ b/libraries/core/src/daemon_messages.rs @@ -0,0 +1,15 @@ +use crate::config::NodeId; + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub enum Request { + Register { node_id: NodeId }, + PrepareOutputMessage { len: usize }, + SendOutMessage { id: MessageId }, +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub enum Reply { + RegisterResult(Result<(), String>), +} + +type MessageId = String; diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index 997a3204..b4b20cc0 100644 --- a/libraries/core/src/lib.rs +++ b/libraries/core/src/lib.rs @@ -5,6 +5,7 @@ use std::{ }; pub mod config; +pub mod daemon_messages; pub mod descriptor; pub mod topics; From c2e4948dd51710bea497e9af9ee2fc3037ff3b4b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 30 Nov 2022 15:16:48 +0100 Subject: [PATCH 004/225] Update Rust node API to communicate through dora-daemon --- 
Cargo.lock | 1 + apis/rust/node/src/communication.rs | 249 -------------------------- apis/rust/node/src/daemon.rs | 164 ++++++++++++++--- apis/rust/node/src/lib.rs | 84 +++------ binaries/daemon/src/main.rs | 27 +-- libraries/core/Cargo.toml | 1 + libraries/core/src/daemon_messages.rs | 48 ++++- 7 files changed, 229 insertions(+), 345 deletions(-) delete mode 100644 apis/rust/node/src/communication.rs diff --git a/Cargo.lock b/Cargo.lock index 123f4fb5..6ab40956 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -955,6 +955,7 @@ dependencies = [ name = "dora-core" version = "0.1.0" dependencies = [ + "dora-message", "eyre", "once_cell", "serde", diff --git a/apis/rust/node/src/communication.rs b/apis/rust/node/src/communication.rs deleted file mode 100644 index ffd908b5..00000000 --- a/apis/rust/node/src/communication.rs +++ /dev/null @@ -1,249 +0,0 @@ -use crate::BoxError; -use communication_layer_pub_sub::ReceivedSample; -pub use communication_layer_pub_sub::{CommunicationLayer, Publisher, Subscriber}; -use dora_core::{ - config::{CommunicationConfig, DataId, InputMapping, NodeId, OperatorId}, - topics, -}; -use dora_message::Metadata; -use eyre::Context; -use std::{ - borrow::Cow, - collections::{BTreeMap, HashSet}, - ops::Deref, - sync::Arc, - thread, -}; - -#[doc(hidden)] -pub const STOP_TOPIC: &str = "__dora_rs_internal__operator_stopped"; - -pub fn init( - communication_config: &CommunicationConfig, -) -> eyre::Result> { - match communication_config { - #[cfg(feature = "zenoh")] - CommunicationConfig::Zenoh { - config: zenoh_config, - prefix: zenoh_prefix, - } => { - let layer = communication_layer_pub_sub::zenoh::ZenohCommunicationLayer::init( - zenoh_config.deref().clone(), - zenoh_prefix.clone(), - ) - .map_err(|err| eyre::eyre!(err))?; - - Ok(Box::new(layer)) - } - #[cfg(not(feature = "zenoh"))] - CommunicationConfig::Zenoh { .. 
} => { - eyre::bail!( - "cannot parse zenoh config because the compile-time `zenoh` feature \ - of `dora-node-api` was disabled" - ) - } - #[cfg(all(unix, feature = "iceoryx"))] - CommunicationConfig::Iceoryx { - app_name_prefix, - topic_prefix, - } => { - let app_name_prefix = app_name_prefix.clone(); - let app_name = format!("{app_name_prefix}-{}", uuid::Uuid::new_v4()); - let instance_name = topic_prefix.clone(); - let layer = communication_layer_pub_sub::iceoryx::IceoryxCommunicationLayer::init( - app_name, - "dora".into(), - instance_name, - ) - .map_err(|err| eyre::eyre!(err))?; - - Ok(Box::new(layer)) - } - #[cfg(not(all(unix, feature = "iceoryx")))] - CommunicationConfig::Iceoryx { .. } => { - eyre::bail!( - "cannot parse iceoryx config because the compile-time `iceoryx` feature \ - of `dora-node-api` was disabled" - ) - } - } -} - -pub fn subscribe_all( - communication: &mut dyn CommunicationLayer, - inputs: &BTreeMap, -) -> eyre::Result> { - let (inputs_tx, inputs_rx) = flume::bounded(10); - let inputs_tx = Arc::new(inputs_tx); - for (input, mapping) in inputs { - let topic = mapping.to_string(); - let mut sub = communication - .subscribe(&topic) - .map_err(|err| eyre::eyre!(err)) - .wrap_err_with(|| format!("failed to subscribe on {topic}"))?; - - let input_id = input.to_owned(); - let sender = inputs_tx.clone(); - thread::spawn(move || loop { - let event = match sub.recv().transpose() { - None => break, - Some(Ok(sample)) => { - let mut raw: &[u8] = &sample.get(); - let full_len = raw.len(); - match Metadata::deserialize(&mut raw).with_context(|| { - format!("failed to deserialize metadata for `{input_id}` message") - }) { - Ok(metadata) => InputEvent::Input(Input { - id: input_id.clone(), - metadata, - data: Data { - offset: full_len - raw.len(), - sample, - }, - }), - Err(err) => InputEvent::ParseMessageError(err), - } - } - Some(Err(err)) => InputEvent::Error(err), - }; - match sender.send(event) { - Ok(()) => {} - Err(flume::SendError(_)) => break, 
- } - }); - } - - let mut sources: HashSet<_> = inputs - .values() - .map(|v| (v.source().to_owned(), v.operator().to_owned())) - .collect(); - for (source, operator) in &sources { - let topic = match operator { - Some(operator) => format!("{source}/{operator}/{STOP_TOPIC}"), - None => format!("{source}/{STOP_TOPIC}"), - }; - let mut sub = communication - .subscribe(&topic) - .map_err(|err| eyre::eyre!(err)) - .wrap_err_with(|| format!("failed to subscribe on {topic}"))?; - - let source = source.to_owned(); - let operator = operator.clone(); - let sender = inputs_tx.clone(); - thread::spawn(move || loop { - let event = match sub.recv().transpose() { - None => break, - Some(Ok(_)) => InputEvent::SourceClosed { - source: source.clone(), - operator: operator.clone(), - }, - Some(Err(err)) => InputEvent::Error(err), - }; - match sender.send(event) { - Ok(()) => {} - Err(flume::SendError(_)) => break, - } - }); - } - - // subscribe to topic for manual stops - { - let topic = topics::MANUAL_STOP; - let mut sub = communication - .subscribe(topic) - .map_err(|err| eyre::eyre!(err)) - .wrap_err_with(|| format!("failed to subscribe on {topic}"))?; - - // only keep a weak reference to the sender because we don't want to - // prevent it from being closed (e.g. 
when all sources are closed) - let sender = Arc::downgrade(&inputs_tx); - std::mem::drop(inputs_tx); - - thread::spawn(move || loop { - let event = match sub.recv().transpose() { - None => break, - Some(Ok(_)) => InputEvent::ManualStop, - Some(Err(err)) => InputEvent::Error(err), - }; - match sender.upgrade() { - Some(sender) => match sender.send(event) { - Ok(()) => {} - Err(flume::SendError(_)) => break, - }, - None => break, - } - }); - } - - let (combined_tx, combined) = flume::bounded(1); - thread::spawn(move || loop { - match inputs_rx.recv() { - Ok(InputEvent::Input(message)) => match combined_tx.send(message) { - Ok(()) => {} - Err(flume::SendError(_)) => break, - }, - Ok(InputEvent::SourceClosed { source, operator }) => { - sources.remove(&(source, operator)); - if sources.is_empty() { - break; - } - } - Ok(InputEvent::ManualStop) => { - tracing::info!("received manual stop message"); - break; - } - Ok(InputEvent::ParseMessageError(err)) => { - tracing::warn!("{err:?}"); - } - Ok(InputEvent::Error(err)) => panic!("{err}"), - Err(_) => break, - } - }); - - Ok(combined) -} - -enum InputEvent { - Input(Input), - SourceClosed { - source: NodeId, - operator: Option, - }, - ManualStop, - Error(BoxError), - ParseMessageError(eyre::Report), -} - -pub struct Input { - pub id: DataId, - pub metadata: Metadata<'static>, - pub data: Data, -} - -impl Input { - pub fn data(&self) -> Cow<[u8]> { - self.data.get() - } - - pub fn metadata(&self) -> &Metadata { - &self.metadata - } -} - -pub struct Data { - sample: Box, - offset: usize, -} - -impl Data { - fn get(&self) -> Cow<[u8]> { - match self.sample.get() { - std::borrow::Cow::Borrowed(data) => Cow::Borrowed(&data[self.offset..]), - std::borrow::Cow::Owned(mut data) => { - // TODO avoid copy caused by moving the remaining elements to the front - data.drain(..self.offset); - Cow::Owned(data) - } - } - } -} diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 91bdfab7..f304659b 100644 --- 
a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -1,43 +1,164 @@ use std::{ - io::{Read, Write}, + io::{ErrorKind, Read, Write}, net::{Ipv4Addr, TcpStream}, }; -use dora_core::{config::NodeId, topics::DORA_DAEMON_PORT_DEFAULT}; -use eyre::{eyre, Context}; +use dora_core::{ + config::{DataId, NodeId}, + daemon_messages::{ControlRequest, NodeEvent, RawMutInput}, + topics::DORA_DAEMON_PORT_DEFAULT, +}; +use eyre::{bail, eyre, Context}; + +pub type EventStream = flume::Receiver; pub struct DaemonConnection { - stream: TcpStream, + pub control_channel: ControlChannel, + pub event_stream: EventStream, } impl DaemonConnection { pub fn init(node_id: NodeId) -> eyre::Result { let localhost = Ipv4Addr::new(127, 0, 0, 1); - let mut stream = TcpStream::connect((localhost, DORA_DAEMON_PORT_DEFAULT)) - .wrap_err("failed to connect to dora-daemon")?; + let control_stream = + init_control_stream(localhost, &node_id).wrap_err("failed to init control stream")?; + + let event_stream = + init_event_stream(localhost, &node_id).wrap_err("failed to init event stream")?; + + Ok(Self { + control_channel: ControlChannel(control_stream), + event_stream, + }) + } +} + +pub struct ControlChannel(TcpStream); +impl ControlChannel { + pub fn report_stop(&mut self) -> eyre::Result<()> { + tcp_send(&mut self.0, &ControlRequest::Stopped) + .wrap_err("failed to send subscribe request to dora-daemon")?; + match tcp_receive(&mut self.0) + .wrap_err("failed to receive subscribe reply from dora-daemon")? 
+ { + dora_core::daemon_messages::ControlReply::Result(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to report stop event to dora-daemon")?, + other => bail!("unexpected stopped reply: {other:?}"), + } + Ok(()) + } + + pub fn prepare_message( + &mut self, + output_id: DataId, + len: usize, + ) -> eyre::Result { tcp_send( - &mut stream, - &dora_core::daemon_messages::Request::Register { node_id }, + &mut self.0, + &ControlRequest::PrepareOutputMessage { output_id, len }, ) - .wrap_err("failed to send register request to dora-daemon")?; + .wrap_err("failed to send PrepareOutputMessage request to dora-daemon")?; + match tcp_receive(&mut self.0) + .wrap_err("failed to receive PrepareOutputMessage reply from dora-daemon")? + { + dora_core::daemon_messages::ControlReply::PreparedMessage { id, data } => { + Ok(MessageSample { id, data }) + } + dora_core::daemon_messages::ControlReply::Result(Err(err)) => { + Err(eyre!(err).wrap_err("failed to report stop event to dora-daemon")) + } + other => bail!("unexpected PrepareOutputMessage reply: {other:?}"), + } + } - match tcp_receive(&mut stream) - .wrap_err("failed to receive register reply from dora-daemon")? + pub fn send_message(&mut self, sample: MessageSample) -> eyre::Result<()> { + tcp_send( + &mut self.0, + &ControlRequest::SendOutMessage { id: sample.id }, + ) + .wrap_err("failed to send SendOutMessage request to dora-daemon")?; + match tcp_receive(&mut self.0) + .wrap_err("failed to receive SendOutMessage reply from dora-daemon")? 
{ - dora_core::daemon_messages::Reply::RegisterResult(result) => result - .map_err(|e| eyre!(e)) - .wrap_err("failed to register node with dora-daemon")?, + dora_core::daemon_messages::ControlReply::Result(result) => { + result.map_err(|err| eyre!(err)) + } + other => bail!("unexpected SendOutMessage reply: {other:?}"), + } + } +} + +pub struct MessageSample { + id: String, + pub data: RawMutInput, +} + +fn init_event_stream(addr: Ipv4Addr, node_id: &NodeId) -> eyre::Result { + let mut event_stream = TcpStream::connect((addr, DORA_DAEMON_PORT_DEFAULT)) + .wrap_err("failed to connect to dora-daemon")?; + tcp_send( + &mut event_stream, + &ControlRequest::Subscribe { + node_id: node_id.clone(), + }, + ) + .wrap_err("failed to send subscribe request to dora-daemon")?; + match tcp_receive(&mut event_stream) + .wrap_err("failed to receive subscribe reply from dora-daemon")? + { + dora_core::daemon_messages::ControlReply::Result(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to create subscription with dora-daemon")?, + other => bail!("unexpected subscribe reply: {other:?}"), + } + + let (tx, rx) = flume::bounded(1); + std::thread::spawn(move || loop { + let event = match tcp_receive(&mut event_stream) { + Ok(event) => event, + Err(err) if err.kind() == ErrorKind::UnexpectedEof => break, + Err(err) => { + let err = eyre!(err).wrap_err("failed to receive incoming event"); + tracing::warn!("{err:?}"); + continue; + } + }; + match tx.send(event) { + Ok(()) => {} + Err(_) => { + // receiving end of channel was closed + break; + } } + }); + + Ok(rx) +} - Ok(Self { stream }) +fn init_control_stream(addr: Ipv4Addr, node_id: &NodeId) -> eyre::Result { + let mut control_stream = TcpStream::connect((addr, DORA_DAEMON_PORT_DEFAULT)) + .wrap_err("failed to connect to dora-daemon")?; + tcp_send( + &mut control_stream, + &ControlRequest::Register { + node_id: node_id.clone(), + }, + ) + .wrap_err("failed to send register request to dora-daemon")?; + match 
tcp_receive(&mut control_stream) + .wrap_err("failed to receive register reply from dora-daemon")? + { + dora_core::daemon_messages::ControlReply::Result(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to register node with dora-daemon")?, + other => bail!("unexpected register reply: {other:?}"), } + Ok(control_stream) } -fn tcp_send( - connection: &mut TcpStream, - request: &dora_core::daemon_messages::Request, -) -> std::io::Result<()> { +fn tcp_send(connection: &mut TcpStream, request: &T) -> std::io::Result<()> { let serialized = serde_json::to_vec(request)?; let len_raw = (serialized.len() as u64).to_le_bytes(); @@ -46,7 +167,10 @@ fn tcp_send( Ok(()) } -fn tcp_receive(connection: &mut TcpStream) -> std::io::Result { +fn tcp_receive(connection: &mut TcpStream) -> std::io::Result +where + T: for<'a> serde::Deserialize<'a>, +{ let reply_len = { let mut raw = [0; 8]; connection.read_exact(&mut raw)?; diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 7557d10a..a4e5ff4b 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -1,25 +1,22 @@ -pub use communication::Input; -use communication::STOP_TOPIC; use communication_layer_pub_sub::CommunicationLayer; -use daemon::DaemonConnection; +use daemon::{ControlChannel, DaemonConnection, EventStream}; pub use dora_core; use dora_core::config::{CommunicationConfig, DataId, NodeId, NodeRunConfig}; pub use dora_message::{uhlc, Metadata, MetadataParameters}; use eyre::WrapErr; pub use flume::Receiver; -pub mod communication; pub mod daemon; pub struct DoraNode { id: NodeId, node_config: NodeRunConfig, - communication: Box, + control_channel: ControlChannel, hlc: uhlc::HLC, } impl DoraNode { - pub fn init_from_env() -> eyre::Result { + pub fn init_from_env() -> eyre::Result<(Self, EventStream)> { #[cfg(feature = "tracing-subscriber")] set_up_tracing().context("failed to set up tracing subscriber")?; @@ -45,26 +42,24 @@ impl DoraNode { id: NodeId, node_config: 
NodeRunConfig, communication_config: CommunicationConfig, - ) -> eyre::Result { - let daemon = - DaemonConnection::init(id.clone()).wrap_err("failed to connect to dora-daemon")?; + ) -> eyre::Result<(Self, EventStream)> { + let DaemonConnection { + control_channel, + event_stream, + } = DaemonConnection::init(id.clone()).wrap_err("failed to connect to dora-daemon")?; - let communication = communication::init(&communication_config)?; - Ok(Self { + let node = Self { id, node_config, - communication, + control_channel, hlc: uhlc::HLC::default(), - }) - } - - pub fn inputs(&mut self) -> eyre::Result> { - communication::subscribe_all(self.communication.as_mut(), &self.node_config.inputs) + }; + Ok((node, event_stream)) } pub fn send_output( &mut self, - output_id: &DataId, + output_id: DataId, parameters: MetadataParameters, data_len: usize, data: F, @@ -72,7 +67,7 @@ impl DoraNode { where F: FnOnce(&mut [u8]), { - if !self.node_config.outputs.contains(output_id) { + if !self.node_config.outputs.contains(&output_id) { eyre::bail!("unknown output"); } let metadata = Metadata::from_parameters(self.hlc.new_timestamp(), parameters); @@ -81,23 +76,17 @@ impl DoraNode { .with_context(|| format!("failed to serialize `{}` message", output_id))?; let full_len = serialized_metadata.len() + data_len; - let self_id = &self.id; - let topic = format!("{self_id}/{output_id}"); - let publisher = self - .communication - .publisher(&topic) - .map_err(|err| eyre::eyre!(err)) - .wrap_err_with(|| format!("failed create publisher for output {output_id}"))?; - - let mut sample = publisher - .prepare(full_len) - .map_err(|err| eyre::eyre!(err))?; - let raw = sample.as_mut_slice(); + let sample = self + .control_channel + .prepare_message(output_id.clone(), full_len) + .wrap_err("failed to prepare sample for output message")?; + + let raw = sample.data.get_mut(); raw[..serialized_metadata.len()].copy_from_slice(&serialized_metadata); data(&mut raw[serialized_metadata.len()..]); - sample - 
.publish() - .map_err(|err| eyre::eyre!(err)) + + self.control_channel + .send_message(sample) .wrap_err_with(|| format!("failed to send data for output {output_id}"))?; Ok(()) } @@ -114,25 +103,8 @@ impl DoraNode { impl Drop for DoraNode { #[tracing::instrument(skip(self), fields(self.id = %self.id))] fn drop(&mut self) { - let self_id = &self.id; - let topic = format!("{self_id}/{STOP_TOPIC}"); - let result = self - .communication - .publisher(&topic) - .map_err(|err| eyre::eyre!(err)) - .wrap_err_with(|| { - format!("failed to create publisher for stop message for node `{self_id}`") - }) - .and_then(|p| { - p.publish(&[]) - .map_err(|err| eyre::eyre!(err)) - .wrap_err_with(|| format!("failed to send stop message for node `{self_id}`")) - }); - match result { - Ok(()) => tracing::info!("sent stop message for {self_id}"), - Err(err) => { - tracing::error!("{err:?}") - } + if let Err(err) = self.control_channel.report_stop() { + tracing::error!("{err:?}"); } } } @@ -149,7 +121,6 @@ fn set_up_tracing() -> eyre::Result<()> { .context("failed to set tracing global subscriber") } -#[must_use] pub fn manual_stop_publisher( communication: &mut dyn CommunicationLayer, ) -> eyre::Result Result<(), BoxError>> { @@ -180,9 +151,8 @@ mod tests { prefix: format!("/{}", uuid::Uuid::new_v4()), }; - let mut node = DoraNode::init(id, node_config, communication_config).unwrap(); + let (_node, events) = DoraNode::init(id, node_config, communication_config).unwrap(); - let inputs = node.inputs().unwrap(); - assert!(inputs.recv().is_err()); + assert!(events.recv().is_err()); } } diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 276b311b..84a9fa98 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -65,21 +65,22 @@ async fn main() -> eyre::Result<()> { continue; } }; - let message: daemon_messages::Request = match serde_json::from_slice(&raw) - .wrap_err("failed to deserialize node message") - { - Ok(e) => e, - Err(err) => { - 
tracing::warn!("{err:?}"); - continue; - } - }; + let message: daemon_messages::ControlRequest = + match serde_json::from_slice(&raw) + .wrap_err("failed to deserialize node message") + { + Ok(e) => e, + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + }; let node_event = match message { - daemon_messages::Request::Register { node_id } => { + daemon_messages::ControlRequest::Register { node_id } => { id = Some(node_id); - let reply = daemon_messages::Reply::RegisterResult(Ok(())); + let reply = daemon_messages::ControlReply::Result(Ok(())); let serialized = serde_json::to_vec(&reply) .wrap_err("failed to serialize register result"); @@ -99,10 +100,10 @@ async fn main() -> eyre::Result<()> { } } } - daemon_messages::Request::PrepareOutputMessage { len } => { + daemon_messages::ControlRequest::PrepareOutputMessage { len } => { NodeEvent::PrepareOutputMessage { len } } - daemon_messages::Request::SendOutMessage { id } => { + daemon_messages::ControlRequest::SendOutMessage { id } => { NodeEvent::SendOutMessage { id } } }; diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml index 6abc3a34..87d0d50d 100644 --- a/libraries/core/Cargo.toml +++ b/libraries/core/Cargo.toml @@ -14,3 +14,4 @@ once_cell = "1.13.0" zenoh-config = { git = "https://github.com/eclipse-zenoh/zenoh.git", rev = "79a136e4fd90b11ff5d775ced981af53c4f1071b" } which = "4.3.0" uuid = { version = "1.2.1", features = ["serde"] } +dora-message = { path = "../message" } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index ab0e6b02..d6a2421c 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -1,15 +1,51 @@ -use crate::config::NodeId; +use crate::config::{DataId, NodeId}; #[derive(Debug, serde::Serialize, serde::Deserialize)] -pub enum Request { +pub enum ControlRequest { Register { node_id: NodeId }, - PrepareOutputMessage { len: usize }, + Subscribe { node_id: NodeId }, + PrepareOutputMessage { 
output_id: DataId, len: usize }, SendOutMessage { id: MessageId }, + Stopped, } +type MessageId = String; + #[derive(Debug, serde::Serialize, serde::Deserialize)] -pub enum Reply { - RegisterResult(Result<(), String>), +pub enum ControlReply { + Result(Result<(), String>), + PreparedMessage { id: MessageId, data: RawMutInput }, } -type MessageId = String; +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub enum NodeEvent { + Stop, + Input(RawInput), +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct RawInput { + shared_memory_pointer: (), // TODO + len: usize, +} + +impl RawInput { + pub fn get(&self) -> &[u8] { + &[] // TODO + } +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct RawMutInput { + shared_memory_pointer: (), // TODO + len: usize, +} + +impl RawMutInput { + pub fn get(&self) -> &[u8] { + &[] // TODO + } + pub fn get_mut(&self) -> &mut [u8] { + &mut [] // TODO + } +} From e36f15e3699176d0c1d50f967cff017c5f46fb1e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 30 Nov 2022 21:12:52 +0100 Subject: [PATCH 005/225] Add support to subscribe to messages in daemon --- binaries/daemon/Cargo.toml | 2 + binaries/daemon/src/main.rs | 289 ++++++++++++++++++-------- libraries/core/src/daemon_messages.rs | 15 +- libraries/message/Cargo.toml | 1 + libraries/message/src/lib.rs | 4 +- 5 files changed, 223 insertions(+), 88 deletions(-) diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 6512ce92..91445bc1 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -16,3 +16,5 @@ serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.86" shared_memory = "0.12.0" dora-core = { path = "../../libraries/core" } +dora-message = { path = "../../libraries/message" } +flume = "0.10.14" diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 84a9fa98..0f37fa5e 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ 
-1,4 +1,9 @@ -use dora_core::{config::NodeId, daemon_messages, topics::DORA_DAEMON_PORT_DEFAULT}; +use dora_core::{ + config::{DataId, NodeId}, + daemon_messages::{self, ControlReply}, + topics::DORA_DAEMON_PORT_DEFAULT, +}; +use dora_message::{uhlc, Metadata}; use eyre::{eyre, Context}; use futures_concurrency::stream::Merge; use shared_memory::ShmemConf; @@ -6,7 +11,7 @@ use std::{collections::HashMap, io::ErrorKind, net::Ipv4Addr}; use tokio::{ io::{AsyncReadExt, AsyncWriteExt}, net::{TcpListener, TcpStream}, - sync::mpsc, + sync::{mpsc, oneshot}, }; use tokio_stream::{ wrappers::{ReceiverStream, TcpListenerStream}, @@ -15,6 +20,10 @@ use tokio_stream::{ #[tokio::main] async fn main() -> eyre::Result<()> { + main_inner().await +} + +async fn main_inner() -> eyre::Result<()> { set_up_tracing().wrap_err("failed to set up tracing subscriber")?; let localhost = Ipv4Addr::new(127, 0, 0, 1); @@ -44,93 +53,28 @@ async fn main() -> eyre::Result<()> { let node_events = ReceiverStream::new(node_events_rx); let mut events = (new_connections, node_events).merge(); + let hlc = uhlc::HLC::default(); let mut uninit_shared_memory = HashMap::new(); let mut sent_out_shared_memory = HashMap::new(); + let mut subscribe_channels = HashMap::new(); + while let Some(event) = events.next().await { match event { - Event::NewConnection(mut connection) => { + Event::NewConnection(connection) => { let events_tx = node_events_tx.clone(); - let mut id = None; - tokio::spawn(async move { - loop { - let raw = match tcp_receive(&mut connection).await { - Ok(data) => data, - Err(err) if err.kind() == ErrorKind::UnexpectedEof => { - break; - } - Err(err) => { - tracing::error!("{err:?}"); - continue; - } - }; - let message: daemon_messages::ControlRequest = - match serde_json::from_slice(&raw) - .wrap_err("failed to deserialize node message") - { - Ok(e) => e, - Err(err) => { - tracing::warn!("{err:?}"); - continue; - } - }; - - let node_event = match message { - 
daemon_messages::ControlRequest::Register { node_id } => { - id = Some(node_id); - - let reply = daemon_messages::ControlReply::Result(Ok(())); - let serialized = serde_json::to_vec(&reply) - .wrap_err("failed to serialize register result"); - - let send_result = match serialized { - Err(err) => { - tracing::warn!("{err:?}"); - continue; - } - Ok(m) => tcp_send(&mut connection, &m).await, - }; - - match send_result { - Ok(()) => continue, - Err(err) => { - tracing::warn!("{err:?}"); - break; // close connection - } - } - } - daemon_messages::ControlRequest::PrepareOutputMessage { len } => { - NodeEvent::PrepareOutputMessage { len } - } - daemon_messages::ControlRequest::SendOutMessage { id } => { - NodeEvent::SendOutMessage { id } - } - }; - let event = Event::Node { - id: match &id { - Some(id) => id.clone(), - None => { - tracing::warn!( - "Ignoring node event because no register \ - message was sent yet: {node_event:?}" - ); - continue; - } - }, - event: node_event, - }; - let Ok(()) = events_tx.send(event).await else { - break; - }; - } - }); + tokio::spawn(handle_connection(connection, events_tx)); } Event::ConnectError(err) => { tracing::warn!("{:?}", err.wrap_err("failed to connect")); } - Event::Node { id, event } => match event { - NodeEvent::PrepareOutputMessage { len } => { + Event::Node { id, event, reply } => match event { + NodeEvent::Subscribe { event_sender } => { + subscribe_channels.insert(id, event_sender); + let _ = reply.send(ControlReply::Result(Ok(()))); + } + NodeEvent::PrepareOutputMessage { output_id, len } => { let memory = ShmemConf::new() .size(len) .create() @@ -145,12 +89,44 @@ async fn main() -> eyre::Result<()> { .remove(&id) .ok_or_else(|| eyre!("invalid shared memory id"))?; + // TODO figure out receivers from dataflow graph + let local_receivers = &[]; + // TODO send shared memory ID to all local receivers + let mut closed = Vec::new(); + for receiver_id in local_receivers { + if let Some(channel) = 
subscribe_channels.get(receiver_id) { + let ptr = (); + let input_id = DataId::from("".to_owned()); + if channel + .send_async(daemon_messages::NodeEvent::Input { + id: input_id, + metadata: Metadata::new(hlc.new_timestamp()), // TODO + data: unsafe { + daemon_messages::RawInput::new(ptr, memory.len()) + }, + }) + .await + .is_err() + { + closed.push(receiver_id); + } + } + } + for id in closed { + subscribe_channels.remove(id); + } + // keep shared memory ptr in order to free it once all subscribers are done let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); + sent_out_shared_memory.insert(id, memory); + // TODO send `data` via network to all remove receivers + } + NodeEvent::Stopped => { + // TODO send stop message to downstream nodes - sent_out_shared_memory.insert(id, memory); + let _ = reply.send(ControlReply::Result(Ok(()))); } }, } @@ -159,24 +135,167 @@ async fn main() -> eyre::Result<()> { Ok(()) } +async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sender) { + let mut id = None; + let mut enter_subscribe_loop = None; + loop { + // receive the next message and parse it + let raw = match tcp_receive(&mut connection).await { + Ok(data) => data, + Err(err) if err.kind() == ErrorKind::UnexpectedEof => { + break; + } + Err(err) => { + tracing::error!("{err:?}"); + continue; + } + }; + let message: daemon_messages::ControlRequest = + match serde_json::from_slice(&raw).wrap_err("failed to deserialize node message") { + Ok(e) => e, + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + }; + + // handle the message and translate it to a NodeEvent + let node_event = match message { + daemon_messages::ControlRequest::Register { node_id } => { + id = Some(node_id); + + let reply = daemon_messages::ControlReply::Result(Ok(())); + let serialized = serde_json::to_vec(&reply) + .wrap_err("failed to serialize register result") + .unwrap(); + + match tcp_send(&mut connection, &serialized).await { + Ok(()) => continue, // 
don't trigger an event for register calls + Err(err) => { + tracing::warn!("{err:?}"); + break; // close connection + } + } + } + daemon_messages::ControlRequest::Stopped => NodeEvent::Stopped, + daemon_messages::ControlRequest::PrepareOutputMessage { output_id, len } => { + NodeEvent::PrepareOutputMessage { output_id, len } + } + daemon_messages::ControlRequest::SendOutMessage { id } => { + NodeEvent::SendOutMessage { id } + } + daemon_messages::ControlRequest::Subscribe { node_id } => { + let (tx, rx) = flume::bounded(10); + + id = Some(node_id); + enter_subscribe_loop = Some(rx); + + NodeEvent::Subscribe { event_sender: tx } + } + }; + + // send NodeEvent to daemon main loop + let (reply_tx, reply) = oneshot::channel(); + let event = Event::Node { + id: match &id { + Some(id) => id.clone(), + None => { + tracing::warn!( + "Ignoring node event because no register \ + message was sent yet: {node_event:?}" + ); + continue; + } + }, + event: node_event, + reply: reply_tx, + }; + let Ok(()) = events_tx.send(event).await else { + break; + }; + + // wait for reply and send it out + let Ok(reply) = reply.await else { + break; // main loop exited + }; + let Ok(serialized) = serde_json::to_vec(&reply) else { + tracing::error!("failed to serialize reply"); + continue; + }; + match tcp_send(&mut connection, &serialized).await { + Ok(()) => {} + Err(err) if err.kind() == ErrorKind::UnexpectedEof => { + break; + } + Err(err) => { + tracing::error!("{err:?}"); + } + } + + // enter subscribe loop after receiving a subscribe message + if let Some(events) = enter_subscribe_loop { + subscribe_loop(connection, events).await; + break; // the subscribe loop only exits when the connection was closed + } + } +} + +async fn subscribe_loop( + mut connection: TcpStream, + events: flume::Receiver, +) { + while let Some(event) = events.stream().next().await { + let message = match serde_json::to_vec(&event) { + Ok(m) => m, + Err(err) => { + let err = eyre!(err).wrap_err("failed to serialize 
node event"); + tracing::warn!("{err:?}"); + continue; + } + }; + match tcp_send(&mut connection, &message).await { + Ok(()) => {} + Err(err) if err.kind() == ErrorKind::UnexpectedEof => { + break; + } + Err(err) => { + tracing::error!("{err:?}"); + } + } + } +} + enum Event { NewConnection(TcpStream), ConnectError(eyre::Report), - Node { id: NodeId, event: NodeEvent }, + Node { + id: NodeId, + event: NodeEvent, + reply: oneshot::Sender, + }, } #[derive(Debug)] pub enum NodeEvent { - PrepareOutputMessage { len: usize }, - SendOutMessage { id: MessageId }, + PrepareOutputMessage { + output_id: DataId, + len: usize, + }, + SendOutMessage { + id: MessageId, + }, + Stopped, + Subscribe { + event_sender: flume::Sender, + }, } type MessageId = String; -async fn tcp_send(connection: &mut TcpStream, request: &[u8]) -> std::io::Result<()> { - let len_raw = (request.len() as u64).to_le_bytes(); +async fn tcp_send(connection: &mut TcpStream, message: &[u8]) -> std::io::Result<()> { + let len_raw = (message.len() as u64).to_le_bytes(); connection.write_all(&len_raw).await?; - connection.write_all(request).await?; + connection.write_all(message).await?; Ok(()) } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index d6a2421c..c8c41226 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -1,3 +1,5 @@ +use dora_message::Metadata; + use crate::config::{DataId, NodeId}; #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -20,7 +22,11 @@ pub enum ControlReply { #[derive(Debug, serde::Serialize, serde::Deserialize)] pub enum NodeEvent { Stop, - Input(RawInput), + Input { + id: DataId, + metadata: Metadata<'static>, + data: RawInput, // TODO add lifetime to borrow from inputs channel while RawInput exists + }, } #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -30,6 +36,13 @@ pub struct RawInput { } impl RawInput { + pub unsafe fn new(ptr: (), len: usize) -> Self { + Self { + 
shared_memory_pointer: ptr, + len, + } + } + pub fn get(&self) -> &[u8] { &[] // TODO } diff --git a/libraries/message/Cargo.toml b/libraries/message/Cargo.toml index 89fd0584..462c4fd5 100644 --- a/libraries/message/Cargo.toml +++ b/libraries/message/Cargo.toml @@ -14,6 +14,7 @@ build = false [dependencies] capnp = { version = "0.14.6", features = ["unaligned"] } uhlc = "0.5.1" +serde = { version = "1.0.136", features = ["derive"] } [build-dependencies] capnpc = "0.14" diff --git a/libraries/message/src/lib.rs b/libraries/message/src/lib.rs index 81ce83b0..f0bc6c8c 100644 --- a/libraries/message/src/lib.rs +++ b/libraries/message/src/lib.rs @@ -7,14 +7,14 @@ pub mod message_capnp { } pub use uhlc; -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct Metadata<'a> { metadata_version: u16, timestamp: uhlc::Timestamp, pub parameters: MetadataParameters<'a>, } -#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)] pub struct MetadataParameters<'a> { pub watermark: u64, pub deadline: u64, From 201fd228f5de3136da3037ff6018b2f826d02888 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 30 Nov 2022 21:13:33 +0100 Subject: [PATCH 006/225] Remove communication layer from dora-node-api --- Cargo.lock | 4 +++- apis/c++/node/Cargo.toml | 4 +--- apis/c/node/Cargo.toml | 1 - apis/rust/node/Cargo.toml | 5 +---- apis/rust/node/src/lib.rs | 34 ++++------------------------------ 5 files changed, 9 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6ab40956..8bd94b84 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -970,7 +970,9 @@ name = "dora-daemon" version = "0.1.0" dependencies = [ "dora-core", + "dora-message", "eyre", + "flume", "futures-concurrency 7.0.0", "serde", "serde_json", @@ -1008,6 +1010,7 @@ version = "0.1.0" dependencies = [ "capnp", "capnpc", + "serde", "uhlc 0.5.1", ] @@ -1027,7 
+1030,6 @@ name = "dora-node-api" version = "0.1.0" dependencies = [ "capnp", - "communication-layer-pub-sub", "dora-core", "dora-message", "eyre", diff --git a/apis/c++/node/Cargo.toml b/apis/c++/node/Cargo.toml index 3b7ab157..028d9b8c 100644 --- a/apis/c++/node/Cargo.toml +++ b/apis/c++/node/Cargo.toml @@ -10,9 +10,7 @@ crate-type = ["staticlib"] [dependencies] cxx = "1.0.73" -dora-node-api = { version = "0.1.0", path = "../../../apis/rust/node", default-features = false, features = [ - "zenoh", -] } +dora-node-api = { version = "0.1.0", path = "../../../apis/rust/node", default-features = false } eyre = "0.6.8" [build-dependencies] diff --git a/apis/c/node/Cargo.toml b/apis/c/node/Cargo.toml index 9351e75c..c38fc627 100644 --- a/apis/c/node/Cargo.toml +++ b/apis/c/node/Cargo.toml @@ -16,5 +16,4 @@ tracing = "0.1.33" [dependencies.dora-node-api] default-features = false -features = ["zenoh"] path = "../../rust/node" diff --git a/apis/rust/node/Cargo.toml b/apis/rust/node/Cargo.toml index 984f1b32..4269f81d 100644 --- a/apis/rust/node/Cargo.toml +++ b/apis/rust/node/Cargo.toml @@ -5,9 +5,7 @@ edition = "2021" license = "Apache-2.0" [features] -default = ["zenoh", "iceoryx", "tracing-subscriber"] -zenoh = ["communication-layer-pub-sub/zenoh"] -iceoryx = ["communication-layer-pub-sub/iceoryx"] +default = ["tracing-subscriber"] tracing-subscriber = ["dep:tracing-subscriber"] [dependencies] @@ -20,7 +18,6 @@ thiserror = "1.0.30" tracing = "0.1.33" tracing-subscriber = { version = "0.3.15", optional = true } flume = "0.10.14" -communication-layer-pub-sub = { path = "../../../libraries/communication-layer/pub-sub", default-features = false } uuid = { version = "1.1.2", features = ["v4"] } capnp = "0.14.9" dora-message = { path = "../../../libraries/message" } diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index a4e5ff4b..03cb5ff1 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -1,7 +1,6 @@ -use 
communication_layer_pub_sub::CommunicationLayer; use daemon::{ControlChannel, DaemonConnection, EventStream}; pub use dora_core; -use dora_core::config::{CommunicationConfig, DataId, NodeId, NodeRunConfig}; +use dora_core::config::{DataId, NodeId, NodeRunConfig}; pub use dora_message::{uhlc, Metadata, MetadataParameters}; use eyre::WrapErr; pub use flume::Receiver; @@ -30,19 +29,10 @@ impl DoraNode { .wrap_err("env variable DORA_NODE_RUN_CONFIG must be set")?; serde_yaml::from_str(&raw).context("failed to deserialize operator config")? }; - let communication_config = { - let raw = std::env::var("DORA_COMMUNICATION_CONFIG") - .wrap_err("env variable DORA_COMMUNICATION_CONFIG must be set")?; - serde_yaml::from_str(&raw).context("failed to deserialize communication config")? - }; - Self::init(id, node_config, communication_config) + Self::init(id, node_config) } - pub fn init( - id: NodeId, - node_config: NodeRunConfig, - communication_config: CommunicationConfig, - ) -> eyre::Result<(Self, EventStream)> { + pub fn init(id: NodeId, node_config: NodeRunConfig) -> eyre::Result<(Self, EventStream)> { let DaemonConnection { control_channel, event_stream, @@ -121,18 +111,6 @@ fn set_up_tracing() -> eyre::Result<()> { .context("failed to set tracing global subscriber") } -pub fn manual_stop_publisher( - communication: &mut dyn CommunicationLayer, -) -> eyre::Result Result<(), BoxError>> { - let hlc = dora_message::uhlc::HLC::default(); - let metadata = dora_message::Metadata::new(hlc.new_timestamp()); - let data = metadata.serialize().unwrap(); - let publisher = communication - .publisher(dora_core::topics::MANUAL_STOP) - .map_err(|err| eyre::eyre!(err))?; - Ok(move || publisher.publish(&data)) -} - #[cfg(test)] mod tests { use dora_core::config; @@ -146,12 +124,8 @@ mod tests { inputs: Default::default(), outputs: Default::default(), }; - let communication_config = config::CommunicationConfig::Zenoh { - config: Default::default(), - prefix: format!("/{}", 
uuid::Uuid::new_v4()), - }; - let (_node, events) = DoraNode::init(id, node_config, communication_config).unwrap(); + let (_node, events) = DoraNode::init(id, node_config).unwrap(); assert!(events.recv().is_err()); } From 1d2d71b15af0d028b3e4c9ce1b404d258e58c67c Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 30 Nov 2022 21:15:31 +0100 Subject: [PATCH 007/225] Comment out uses of communication layer in coordinator for now --- binaries/coordinator/src/lib.rs | 27 ++++++++++++++------------- binaries/coordinator/src/run/mod.rs | 22 +++++++++++----------- binaries/runtime/Cargo.toml | 5 +---- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index aec5212e..c8d93ff3 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -7,7 +7,6 @@ use dora_core::{ StopDataflowResult, }, }; -use dora_node_api::{communication, manual_stop_publisher}; use eyre::{bail, eyre, WrapErr}; use futures::StreamExt; use futures_concurrency::stream::Merge; @@ -259,18 +258,20 @@ async fn stop_dataflow( Some(dataflow) => dataflow.communication_config.clone(), None => bail!("No running dataflow found with UUID `{uuid}`"), }; - let mut communication = - tokio::task::spawn_blocking(move || communication::init(&communication_config)) - .await - .wrap_err("failed to join communication layer init task")? - .wrap_err("failed to init communication layer")?; - tracing::info!("sending stop message to dataflow `{uuid}`"); - let manual_stop_publisher = manual_stop_publisher(communication.as_mut())?; - tokio::task::spawn_blocking(move || manual_stop_publisher()) - .await - .wrap_err("failed to join stop publish task")? 
- .map_err(|err| eyre!(err)) - .wrap_err("failed to send stop message")?; + + todo!(); + // let mut communication = + // tokio::task::spawn_blocking(move || communication::init(&communication_config)) + // .await + // .wrap_err("failed to join communication layer init task")? + // .wrap_err("failed to init communication layer")?; + // tracing::info!("sending stop message to dataflow `{uuid}`"); + // let manual_stop_publisher = manual_stop_publisher(communication.as_mut())?; + // tokio::task::spawn_blocking(move || manual_stop_publisher()) + // .await + // .wrap_err("failed to join stop publish task")? + // .map_err(|err| eyre!(err)) + // .wrap_err("failed to send stop message")?; Ok(()) } diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index 7422a04b..17b3c902 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -3,7 +3,6 @@ use dora_core::{ config::{format_duration, CommunicationConfig, NodeId}, descriptor::{self, collect_dora_timers, CoreNodeKind, Descriptor}, }; -use dora_node_api::communication; use eyre::{bail, eyre, WrapErr}; use futures::{stream::FuturesUnordered, StreamExt}; use std::{env::consts::EXE_EXTENSION, path::Path}; @@ -93,11 +92,11 @@ pub async fn spawn_dataflow(runtime: &Path, dataflow_path: &Path) -> eyre::Resul } for interval in dora_timers { let communication_config = communication_config.clone(); - let mut communication = - tokio::task::spawn_blocking(move || communication::init(&communication_config)) - .await - .wrap_err("failed to join communication layer init task")? - .wrap_err("failed to init communication layer")?; + // let mut communication = + // tokio::task::spawn_blocking(move || communication::init(&communication_config)) + // .await + // .wrap_err("failed to join communication layer init task")? 
+ // .wrap_err("failed to init communication layer")?; tokio::spawn(async move { let topic = { let duration = format_duration(interval); @@ -108,11 +107,12 @@ pub async fn spawn_dataflow(runtime: &Path, dataflow_path: &Path) -> eyre::Resul while (stream.next().await).is_some() { let metadata = dora_message::Metadata::new(hlc.new_timestamp()); let data = metadata.serialize().unwrap(); - communication - .publisher(&topic) - .unwrap() - .publish(&data) - .expect("failed to publish timer tick message"); + // communication + // .publisher(&topic) + // .unwrap() + // .publish(&data) + // .expect("failed to publish timer tick message"); + todo!() } }); } diff --git a/binaries/runtime/Cargo.toml b/binaries/runtime/Cargo.toml index ebf80c12..8d0debb6 100644 --- a/binaries/runtime/Cargo.toml +++ b/binaries/runtime/Cargo.toml @@ -8,10 +8,7 @@ license = "Apache-2.0" [dependencies] clap = { version = "3.1.12", features = ["derive"] } -dora-node-api = { path = "../../apis/rust/node", default-features = false, features = [ - "zenoh", - "iceoryx", -] } +dora-node-api = { path = "../../apis/rust/node", default-features = false } dora-operator-api-python = { path = "../../apis/python/operator" } dora-operator-api-types = { path = "../../apis/rust/operator/types" } dora-core = { version = "0.1.0", path = "../../libraries/core" } From d01dc38134c7c2bd93fd87e67d601a6bdc59c00f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 30 Nov 2022 21:16:20 +0100 Subject: [PATCH 008/225] Update nodes of rust example to new API --- examples/rust-dataflow/node/src/main.rs | 40 ++++++++++++++----------- examples/rust-dataflow/sink/src/main.rs | 40 ++++++++++++++----------- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/examples/rust-dataflow/node/src/main.rs b/examples/rust-dataflow/node/src/main.rs index 7e8f92fb..ce7408df 100644 --- a/examples/rust-dataflow/node/src/main.rs +++ b/examples/rust-dataflow/node/src/main.rs @@ -1,32 +1,36 @@ -use dora_node_api::{self, 
dora_core::config::DataId, DoraNode}; +use dora_node_api::{ + self, + dora_core::{config::DataId, daemon_messages::NodeEvent}, + DoraNode, +}; fn main() -> eyre::Result<()> { let output = DataId::from("random".to_owned()); - let mut operator = DoraNode::init_from_env()?; - - let inputs = operator.inputs()?; + let (mut node, events) = DoraNode::init_from_env()?; for _ in 0..20 { - let input = match inputs.recv() { + let event = match events.recv() { Ok(input) => input, Err(_) => break, }; - match input.id.as_str() { - "tick" => { - let random: u64 = rand::random(); - let data: &[u8] = &random.to_le_bytes(); - operator.send_output( - &output, - input.metadata().parameters.clone(), - data.len(), - |out| { + match event { + NodeEvent::Stop => break, + NodeEvent::Input { + id, + metadata, + data: _, + } => match id.as_str() { + "tick" => { + let random: u64 = rand::random(); + let data: &[u8] = &random.to_le_bytes(); + node.send_output(output.clone(), metadata.parameters, data.len(), |out| { out.copy_from_slice(data); - }, - )?; - } - other => eprintln!("Ignoring unexpected input `{other}`"), + })?; + } + other => eprintln!("Ignoring unexpected input `{other}`"), + }, } } diff --git a/examples/rust-dataflow/sink/src/main.rs b/examples/rust-dataflow/sink/src/main.rs index f9c932a5..16091882 100644 --- a/examples/rust-dataflow/sink/src/main.rs +++ b/examples/rust-dataflow/sink/src/main.rs @@ -1,26 +1,30 @@ -use dora_node_api::{self, DoraNode}; +use dora_node_api::{self, dora_core::daemon_messages::NodeEvent, DoraNode}; use eyre::{bail, Context}; fn main() -> eyre::Result<()> { - let mut operator = DoraNode::init_from_env()?; + let (_node, events) = DoraNode::init_from_env()?; - let inputs = operator.inputs()?; - - while let Ok(input) = inputs.recv() { - match input.id.as_str() { - "message" => { - let data = input.data(); - let received_string = - std::str::from_utf8(&data).wrap_err("received message was not utf8-encoded")?; - println!("received message: {}", 
received_string); - if !received_string.starts_with("operator received random value ") { - bail!("unexpected message format (should start with 'operator received random value')") - } - if !received_string.ends_with(" ticks") { - bail!("unexpected message format (should end with 'ticks')") + while let Ok(event) = events.recv() { + match event { + NodeEvent::Stop => break, + NodeEvent::Input { + id, + metadata: _, + data, + } => match id.as_str() { + "message" => { + let received_string = std::str::from_utf8(data.get()) + .wrap_err("received message was not utf8-encoded")?; + println!("received message: {}", received_string); + if !received_string.starts_with("operator received random value ") { + bail!("unexpected message format (should start with 'operator received random value')") + } + if !received_string.ends_with(" ticks") { + bail!("unexpected message format (should end with 'ticks')") + } } - } - other => eprintln!("Ignoring unexpected input `{other}`"), + other => eprintln!("Ignoring unexpected input `{other}`"), + }, } } From 2e7ef8b6307d6c70831c93f619caa0a0f838b05d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 7 Dec 2022 16:11:02 +0100 Subject: [PATCH 009/225] Implement shared memory mapping in node API --- Cargo.lock | 2 + apis/rust/node/Cargo.toml | 1 + apis/rust/node/src/daemon.rs | 11 +++--- apis/rust/node/src/lib.rs | 15 ++++++-- binaries/daemon/src/main.rs | 39 ++++++++++++-------- examples/rust-dataflow/sink/src/main.rs | 3 +- libraries/core/Cargo.toml | 1 + libraries/core/src/daemon_messages.rs | 49 ++++++++++++------------- 8 files changed, 70 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8bd94b84..685f17ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -960,6 +960,7 @@ dependencies = [ "once_cell", "serde", "serde_yaml 0.9.11", + "shared_memory", "uuid 1.2.1", "which", "zenoh-config", @@ -1038,6 +1039,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml 0.8.23", + "shared_memory", "thiserror", "tokio", 
"tracing", diff --git a/apis/rust/node/Cargo.toml b/apis/rust/node/Cargo.toml index 4269f81d..63d731b6 100644 --- a/apis/rust/node/Cargo.toml +++ b/apis/rust/node/Cargo.toml @@ -22,6 +22,7 @@ uuid = { version = "1.1.2", features = ["v4"] } capnp = "0.14.9" dora-message = { path = "../../../libraries/message" } dora-core = { path = "../../../libraries/core" } +shared_memory = "0.12.0" [dev-dependencies] tokio = { version = "1.17.0", features = ["rt"] } diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index f304659b..3229c1c0 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -5,7 +5,7 @@ use std::{ use dora_core::{ config::{DataId, NodeId}, - daemon_messages::{ControlRequest, NodeEvent, RawMutInput}, + daemon_messages::{ControlRequest, NodeEvent}, topics::DORA_DAEMON_PORT_DEFAULT, }; use eyre::{bail, eyre, Context}; @@ -63,9 +63,9 @@ impl ControlChannel { match tcp_receive(&mut self.0) .wrap_err("failed to receive PrepareOutputMessage reply from dora-daemon")? 
{ - dora_core::daemon_messages::ControlReply::PreparedMessage { id, data } => { - Ok(MessageSample { id, data }) - } + dora_core::daemon_messages::ControlReply::PreparedMessage { + shared_memory_id: id, + } => Ok(MessageSample { id }), dora_core::daemon_messages::ControlReply::Result(Err(err)) => { Err(eyre!(err).wrap_err("failed to report stop event to dora-daemon")) } @@ -91,8 +91,7 @@ impl ControlChannel { } pub struct MessageSample { - id: String, - pub data: RawMutInput, + pub id: String, } fn init_event_stream(addr: Ipv4Addr, node_id: &NodeId) -> eyre::Result { diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 03cb5ff1..5f34f314 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -4,6 +4,7 @@ use dora_core::config::{DataId, NodeId, NodeRunConfig}; pub use dora_message::{uhlc, Metadata, MetadataParameters}; use eyre::WrapErr; pub use flume::Receiver; +use shared_memory::ShmemConf; pub mod daemon; @@ -71,9 +72,17 @@ impl DoraNode { .prepare_message(output_id.clone(), full_len) .wrap_err("failed to prepare sample for output message")?; - let raw = sample.data.get_mut(); - raw[..serialized_metadata.len()].copy_from_slice(&serialized_metadata); - data(&mut raw[serialized_metadata.len()..]); + // map shared memory and fill in data + { + let mut shared_memory = ShmemConf::new() + .os_id(&sample.id) + .open() + .wrap_err("failed to open shared memory sample")?; + + let raw = unsafe { shared_memory.as_slice_mut() }; + raw[..serialized_metadata.len()].copy_from_slice(&serialized_metadata); + data(&mut raw[serialized_metadata.len()..]); + } self.control_channel .send_message(sample) diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 0f37fa5e..86552c2b 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -69,10 +69,14 @@ async fn main_inner() -> eyre::Result<()> { Event::ConnectError(err) => { tracing::warn!("{:?}", err.wrap_err("failed to connect")); } - Event::Node { id, 
event, reply } => match event { + Event::Node { + id, + event, + reply_sender, + } => match event { NodeEvent::Subscribe { event_sender } => { subscribe_channels.insert(id, event_sender); - let _ = reply.send(ControlReply::Result(Ok(()))); + let _ = reply_sender.send(ControlReply::Result(Ok(()))); } NodeEvent::PrepareOutputMessage { output_id, len } => { let memory = ShmemConf::new() @@ -80,9 +84,15 @@ async fn main_inner() -> eyre::Result<()> { .create() .wrap_err("failed to allocate shared memory")?; let id = memory.get_os_id().to_owned(); - uninit_shared_memory.insert(id, memory); - - // TODO send reply with id + uninit_shared_memory.insert(id.clone(), memory); + + let reply = ControlReply::PreparedMessage { + shared_memory_id: id.clone(), + }; + if reply_sender.send(reply).is_err() { + // free shared memory slice again + uninit_shared_memory.remove(&id); + } } NodeEvent::SendOutMessage { id } => { let memory = uninit_shared_memory @@ -92,19 +102,16 @@ async fn main_inner() -> eyre::Result<()> { // TODO figure out receivers from dataflow graph let local_receivers = &[]; - // TODO send shared memory ID to all local receivers + // send shared memory ID to all local receivers let mut closed = Vec::new(); for receiver_id in local_receivers { if let Some(channel) = subscribe_channels.get(receiver_id) { - let ptr = (); let input_id = DataId::from("".to_owned()); if channel .send_async(daemon_messages::NodeEvent::Input { id: input_id, metadata: Metadata::new(hlc.new_timestamp()), // TODO - data: unsafe { - daemon_messages::RawInput::new(ptr, memory.len()) - }, + data: unsafe { daemon_messages::InputData::new(id.clone()) }, }) .await .is_err() @@ -117,16 +124,16 @@ async fn main_inner() -> eyre::Result<()> { subscribe_channels.remove(id); } - // keep shared memory ptr in order to free it once all subscribers are done + // TODO send `data` via network to all remove receivers let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); - 
sent_out_shared_memory.insert(id, memory); - // TODO send `data` via network to all remove receivers + // keep shared memory ptr in order to free it once all subscribers are done + sent_out_shared_memory.insert(id, memory); } NodeEvent::Stopped => { // TODO send stop message to downstream nodes - let _ = reply.send(ControlReply::Result(Ok(()))); + let _ = reply_sender.send(ControlReply::Result(Ok(()))); } }, } @@ -208,7 +215,7 @@ async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sender, + reply_sender: oneshot::Sender, }, } diff --git a/examples/rust-dataflow/sink/src/main.rs b/examples/rust-dataflow/sink/src/main.rs index 16091882..ee12b7f0 100644 --- a/examples/rust-dataflow/sink/src/main.rs +++ b/examples/rust-dataflow/sink/src/main.rs @@ -13,7 +13,8 @@ fn main() -> eyre::Result<()> { data, } => match id.as_str() { "message" => { - let received_string = std::str::from_utf8(data.get()) + let data = data.map()?; + let received_string = std::str::from_utf8(&data) .wrap_err("received message was not utf8-encoded")?; println!("received message: {}", received_string); if !received_string.starts_with("operator received random value ") { diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml index 87d0d50d..4c5dcf61 100644 --- a/libraries/core/Cargo.toml +++ b/libraries/core/Cargo.toml @@ -15,3 +15,4 @@ zenoh-config = { git = "https://github.com/eclipse-zenoh/zenoh.git", rev = "79a1 which = "4.3.0" uuid = { version = "1.2.1", features = ["serde"] } dora-message = { path = "../message" } +shared_memory = "0.12.0" diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index c8c41226..375a53df 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -1,4 +1,6 @@ use dora_message::Metadata; +use eyre::Context; +use shared_memory::{Shmem, ShmemConf}; use crate::config::{DataId, NodeId}; @@ -7,16 +9,16 @@ pub enum ControlRequest { Register { node_id: NodeId }, 
Subscribe { node_id: NodeId }, PrepareOutputMessage { output_id: DataId, len: usize }, - SendOutMessage { id: MessageId }, + SendOutMessage { id: SharedMemoryId }, Stopped, } -type MessageId = String; +type SharedMemoryId = String; #[derive(Debug, serde::Serialize, serde::Deserialize)] pub enum ControlReply { Result(Result<(), String>), - PreparedMessage { id: MessageId, data: RawMutInput }, + PreparedMessage { shared_memory_id: SharedMemoryId }, } #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -25,40 +27,37 @@ pub enum NodeEvent { Input { id: DataId, metadata: Metadata<'static>, - data: RawInput, // TODO add lifetime to borrow from inputs channel while RawInput exists + data: InputData, }, } #[derive(Debug, serde::Serialize, serde::Deserialize)] -pub struct RawInput { - shared_memory_pointer: (), // TODO - len: usize, +pub struct InputData { + shared_memory_id: SharedMemoryId, } -impl RawInput { - pub unsafe fn new(ptr: (), len: usize) -> Self { - Self { - shared_memory_pointer: ptr, - len, - } +impl InputData { + pub unsafe fn new(shared_memory_id: SharedMemoryId) -> Self { + Self { shared_memory_id } } - pub fn get(&self) -> &[u8] { - &[] // TODO + pub fn map(self) -> eyre::Result { + let memory = ShmemConf::new() + .os_id(self.shared_memory_id) + .open() + .wrap_err("failed to map shared memory input")?; + Ok(MappedInputData { memory }) } } -#[derive(Debug, serde::Serialize, serde::Deserialize)] -pub struct RawMutInput { - shared_memory_pointer: (), // TODO - len: usize, +pub struct MappedInputData { + memory: Shmem, } -impl RawMutInput { - pub fn get(&self) -> &[u8] { - &[] // TODO - } - pub fn get_mut(&self) -> &mut [u8] { - &mut [] // TODO +impl std::ops::Deref for MappedInputData { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + unsafe { self.memory.as_slice() } } } From 4d291927a186dbbc1115e2ffbbc4e1c6b80c9d06 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 9 Dec 2022 16:27:03 +0100 Subject: [PATCH 010/225] Define new 
message types --- libraries/core/src/coordinator_messages.rs | 21 +++++++++++++++++++++ libraries/core/src/daemon_messages.rs | 8 +++++++- libraries/core/src/lib.rs | 1 + libraries/core/src/topics.rs | 2 +- 4 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 libraries/core/src/coordinator_messages.rs diff --git a/libraries/core/src/coordinator_messages.rs b/libraries/core/src/coordinator_messages.rs new file mode 100644 index 00000000..5c85e754 --- /dev/null +++ b/libraries/core/src/coordinator_messages.rs @@ -0,0 +1,21 @@ +use eyre::eyre; + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub enum CoordinatorRequest { + Register { machine_id: String }, +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub enum RegisterResult { + Ok, + Err(String), +} + +impl RegisterResult { + pub fn to_result(self) -> eyre::Result<()> { + match self { + RegisterResult::Ok => Ok(()), + RegisterResult::Err(err) => Err(eyre!(err)), + } + } +} diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 375a53df..df3a849a 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -1,8 +1,14 @@ +use crate::config::{DataId, NodeId, NodeRunConfig}; use dora_message::Metadata; use eyre::Context; use shared_memory::{Shmem, ShmemConf}; -use crate::config::{DataId, NodeId}; +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct NodeConfig { + pub node_id: NodeId, + pub run_config: NodeRunConfig, + pub daemon_port: u16, +} #[derive(Debug, serde::Serialize, serde::Deserialize)] pub enum ControlRequest { diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index b4b20cc0..a96517dc 100644 --- a/libraries/core/src/lib.rs +++ b/libraries/core/src/lib.rs @@ -5,6 +5,7 @@ use std::{ }; pub mod config; +pub mod coordinator_messages; pub mod daemon_messages; pub mod descriptor; pub mod topics; diff --git a/libraries/core/src/topics.rs b/libraries/core/src/topics.rs 
index ca05fb41..23463264 100644 --- a/libraries/core/src/topics.rs +++ b/libraries/core/src/topics.rs @@ -5,7 +5,7 @@ use std::{ }; use uuid::Uuid; -pub const DORA_DAEMON_PORT_DEFAULT: u16 = 0xD02A; +pub const DORA_COORDINATOR_PORT_DEFAULT: u16 = 0xD02A; pub const MANUAL_STOP: &str = "dora/stop"; From c58745be8530fbb998db7460198d2d5cc53b3b76 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 9 Dec 2022 16:28:19 +0100 Subject: [PATCH 011/225] Listen for daemon messages in coordinator --- binaries/coordinator/Cargo.toml | 2 +- binaries/coordinator/src/lib.rs | 75 ++++++++++++++++++++++++--- binaries/coordinator/src/listener.rs | 55 ++++++++++++++++++++ binaries/coordinator/src/tcp_utils.rs | 22 ++++++++ 4 files changed, 146 insertions(+), 8 deletions(-) create mode 100644 binaries/coordinator/src/listener.rs create mode 100644 binaries/coordinator/src/tcp_utils.rs diff --git a/binaries/coordinator/Cargo.toml b/binaries/coordinator/Cargo.toml index af5ef556..0b035824 100644 --- a/binaries/coordinator/Cargo.toml +++ b/binaries/coordinator/Cargo.toml @@ -14,7 +14,7 @@ futures = "0.3.21" serde = { version = "1.0.136", features = ["derive"] } serde_yaml = "0.8.23" tokio = { version = "1.21.2", features = ["full"] } -tokio-stream = { version = "0.1.8", features = ["io-util"] } +tokio-stream = { version = "0.1.8", features = ["io-util", "net"] } tokio-util = { version = "0.7.1", features = ["codec"] } clap = { version = "3.1.8", features = ["derive"] } uuid = { version = "1.2.1" } diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index c8d93ff3..a63965ae 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -1,13 +1,14 @@ -use crate::run::spawn_dataflow; +use crate::{run::spawn_dataflow, tcp_utils::tcp_send}; use control::ControlEvent; use dora_core::{ config::CommunicationConfig, + coordinator_messages::RegisterResult, topics::{ control_socket_addr, ControlRequest, DataflowId, ListDataflowResult, 
StartDataflowResult, - StopDataflowResult, + StopDataflowResult, DORA_COORDINATOR_PORT_DEFAULT, }, }; -use eyre::{bail, eyre, WrapErr}; +use eyre::{bail, WrapErr}; use futures::StreamExt; use futures_concurrency::stream::Merge; use run::{await_tasks, SpawnedDataflow}; @@ -15,11 +16,14 @@ use std::{ collections::HashMap, path::{Path, PathBuf}, }; -use tokio_stream::wrappers::ReceiverStream; +use tokio::net::TcpStream; +use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream}; use uuid::Uuid; mod control; +mod listener; mod run; +mod tcp_utils; #[derive(Debug, Clone, clap::Parser)] #[clap(about = "Dora coordinator")] @@ -61,22 +65,69 @@ pub async fn run(args: Args) -> eyre::Result<()> { } async fn start(runtime_path: &Path) -> eyre::Result<()> { + let listener = listener::create_listener(DORA_COORDINATOR_PORT_DEFAULT).await?; + let new_daemon_connections = TcpListenerStream::new(listener).map(|c| { + c.map(Event::NewDaemonConnection) + .wrap_err("failed to open connection") + .unwrap_or_else(Event::DaemonConnectError) + }); + let (dataflow_events_tx, dataflow_events) = tokio::sync::mpsc::channel(2); let mut dataflow_events_tx = Some(dataflow_events_tx); let dataflow_events = ReceiverStream::new(dataflow_events); + let (daemon_events_tx, daemon_events) = tokio::sync::mpsc::channel(2); + let daemon_events = ReceiverStream::new(daemon_events); + let (control_events, control_events_abort) = futures::stream::abortable( control::control_events(control_socket_addr()) .await .wrap_err("failed to create control events")?, ); - let mut events = (dataflow_events, control_events).merge(); + let mut events = ( + new_daemon_connections, + daemon_events, + dataflow_events, + control_events, + ) + .merge(); let mut running_dataflows = HashMap::new(); + let mut daemon_connections = HashMap::new(); while let Some(event) = events.next().await { match event { + Event::NewDaemonConnection(connection) => { + let events_tx = daemon_events_tx.clone(); + 
tokio::spawn(listener::handle_connection(connection, events_tx)); + } + Event::DaemonConnectError(err) => { + tracing::warn!("{:?}", err.wrap_err("failed to connect to dora-daemon")); + } + Event::Daemon(event) => match event { + DaemonEvent::Register { + machine_id, + mut connection, + } => match daemon_connections.entry(machine_id) { + std::collections::hash_map::Entry::Vacant(entry) => { + let reply = RegisterResult::Ok; + if tcp_send(&mut connection, &serde_json::to_vec(&reply)?) + .await + .is_ok() + { + entry.insert(connection); + } + } + std::collections::hash_map::Entry::Occupied(entry) => { + let reply = RegisterResult::Err(format!( + "there is already a daemon connection for machine `{}`", + entry.key() + )); + let _ = tcp_send(&mut connection, &serde_json::to_vec(&reply)?).await; + } + }, + }, Event::Dataflow { uuid, event } => match event { DataflowEvent::Finished { result } => { running_dataflows.remove(&uuid); @@ -312,11 +363,21 @@ async fn start_dataflow( }) } -enum Event { +pub enum Event { + NewDaemonConnection(TcpStream), + DaemonConnectError(eyre::Report), Dataflow { uuid: Uuid, event: DataflowEvent }, Control(ControlEvent), + Daemon(DaemonEvent), } -enum DataflowEvent { +pub enum DataflowEvent { Finished { result: eyre::Result<()> }, } + +pub enum DaemonEvent { + Register { + machine_id: String, + connection: TcpStream, + }, +} diff --git a/binaries/coordinator/src/listener.rs b/binaries/coordinator/src/listener.rs new file mode 100644 index 00000000..63164866 --- /dev/null +++ b/binaries/coordinator/src/listener.rs @@ -0,0 +1,55 @@ +use crate::{tcp_utils::tcp_receive, DaemonEvent, Event}; +use dora_core::coordinator_messages; +use eyre::Context; +use std::{io::ErrorKind, net::Ipv4Addr}; +use tokio::{ + net::{TcpListener, TcpStream}, + sync::mpsc, +}; + +pub async fn create_listener(port: u16) -> eyre::Result { + let localhost = Ipv4Addr::new(127, 0, 0, 1); + let socket = match TcpListener::bind((localhost, port)).await { + Ok(socket) => socket, 
+ Err(err) => { + return Err(eyre::Report::new(err).wrap_err("failed to create local TCP listener")) + } + }; + Ok(socket) +} + +pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sender) { + loop { + // receive the next message and parse it + let raw = match tcp_receive(&mut connection).await { + Ok(data) => data, + Err(err) if err.kind() == ErrorKind::UnexpectedEof => { + break; + } + Err(err) => { + tracing::error!("{err:?}"); + continue; + } + }; + let message: coordinator_messages::CoordinatorRequest = + match serde_json::from_slice(&raw).wrap_err("failed to deserialize node message") { + Ok(e) => e, + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + }; + + // handle the message and translate it to a DaemonEvent + match message { + coordinator_messages::CoordinatorRequest::Register { machine_id } => { + let event = DaemonEvent::Register { + machine_id, + connection, + }; + let _ = events_tx.send(Event::Daemon(event)).await; + break; + } + }; + } +} diff --git a/binaries/coordinator/src/tcp_utils.rs b/binaries/coordinator/src/tcp_utils.rs new file mode 100644 index 00000000..31f5e3b5 --- /dev/null +++ b/binaries/coordinator/src/tcp_utils.rs @@ -0,0 +1,22 @@ +use tokio::{ + io::{AsyncReadExt, AsyncWriteExt}, + net::TcpStream, +}; + +pub async fn tcp_send(connection: &mut TcpStream, message: &[u8]) -> std::io::Result<()> { + let len_raw = (message.len() as u64).to_le_bytes(); + connection.write_all(&len_raw).await?; + connection.write_all(message).await?; + Ok(()) +} + +pub async fn tcp_receive(connection: &mut TcpStream) -> std::io::Result> { + let reply_len = { + let mut raw = [0; 8]; + connection.read_exact(&mut raw).await?; + u64::from_le_bytes(raw) as usize + }; + let mut reply = vec![0; reply_len]; + connection.read_exact(&mut reply).await?; + Ok(reply) +} From 3312af19412dc4123810cf7d8bc47da5ecee5531 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 9 Dec 2022 16:29:09 +0100 Subject: [PATCH 012/225] Connect to 
dora-coordinator and implement command to spawn nodes --- binaries/daemon/Cargo.toml | 2 + binaries/daemon/src/coordinator.rs | 60 ++++ binaries/daemon/src/listener.rs | 153 +++++++++++ binaries/daemon/src/main.rs | 423 ++++++++++++----------------- binaries/daemon/src/spawn.rs | 78 ++++++ binaries/daemon/src/tcp_utils.rs | 22 ++ 6 files changed, 492 insertions(+), 246 deletions(-) create mode 100644 binaries/daemon/src/coordinator.rs create mode 100644 binaries/daemon/src/listener.rs create mode 100644 binaries/daemon/src/spawn.rs create mode 100644 binaries/daemon/src/tcp_utils.rs diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 91445bc1..530c7a09 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -18,3 +18,5 @@ shared_memory = "0.12.0" dora-core = { path = "../../libraries/core" } dora-message = { path = "../../libraries/message" } flume = "0.10.14" +dora-download = { path = "../../libraries/extensions/download" } +serde_yaml = "0.8.23" diff --git a/binaries/daemon/src/coordinator.rs b/binaries/daemon/src/coordinator.rs new file mode 100644 index 00000000..457b79c0 --- /dev/null +++ b/binaries/daemon/src/coordinator.rs @@ -0,0 +1,60 @@ +use crate::{ + tcp_utils::{tcp_receive, tcp_send}, + DaemonCoordinatorEvent, +}; +use dora_core::coordinator_messages::{CoordinatorRequest, RegisterResult}; +use eyre::{eyre, Context}; +use std::{io::ErrorKind, net::SocketAddr}; +use tokio::{net::TcpStream, sync::mpsc}; +use tokio_stream::{wrappers::ReceiverStream, Stream}; + +pub async fn connect(addr: SocketAddr) -> eyre::Result> { + let mut stream = TcpStream::connect(addr) + .await + .wrap_err("failed to connect to dora-coordinator")?; + let register = serde_json::to_vec(&CoordinatorRequest::Register { + machine_id: String::new(), // TODO + })?; + tcp_send(&mut stream, ®ister) + .await + .wrap_err("failed to send register request to dora-coordinator")?; + let reply_raw = tcp_receive(&mut stream) + .await + .wrap_err("failed 
to register reply from dora-coordinator")?; + let result: RegisterResult = serde_json::from_slice(&reply_raw) + .wrap_err("failed to deserialize dora-coordinator reply")?; + result.to_result()?; + tracing::info!("Connected to dora-coordinator at {:?}", addr); + + let (tx, rx) = mpsc::channel(1); + tokio::spawn(async move { + loop { + let event = match tcp_receive(&mut stream).await { + Ok(raw) => match serde_json::from_slice(&raw) { + Ok(event) => event, + Err(err) => { + let err = + eyre!(err).wrap_err("failed to deserialize incoming coordinator event"); + tracing::warn!("{err:?}"); + continue; + } + }, + Err(err) if err.kind() == ErrorKind::UnexpectedEof => break, + Err(err) => { + let err = eyre!(err).wrap_err("failed to receive incoming event"); + tracing::warn!("{err:?}"); + continue; + } + }; + match tx.send(event).await { + Ok(()) => {} + Err(_) => { + // receiving end of channel was closed + break; + } + } + } + }); + + Ok(ReceiverStream::new(rx)) +} diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs new file mode 100644 index 00000000..eb42399d --- /dev/null +++ b/binaries/daemon/src/listener.rs @@ -0,0 +1,153 @@ +use crate::{ + tcp_utils::{tcp_receive, tcp_send}, + DaemonNodeEvent, Event, +}; +use dora_core::daemon_messages; +use eyre::{eyre, Context}; +use std::{io::ErrorKind, net::Ipv4Addr}; +use tokio::{ + net::{TcpListener, TcpStream}, + sync::{mpsc, oneshot}, +}; +use tokio_stream::StreamExt; + +pub async fn create_listener() -> eyre::Result { + let localhost = Ipv4Addr::new(127, 0, 0, 1); + let socket = match TcpListener::bind((localhost, 0)).await { + Ok(socket) => socket, + Err(err) => { + return Err(eyre::Report::new(err).wrap_err("failed to create local TCP listener")) + } + }; + Ok(socket) +} + +pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sender) { + let mut id = None; + let mut enter_subscribe_loop = None; + loop { + // receive the next message and parse it + let raw = match 
tcp_receive(&mut connection).await { + Ok(data) => data, + Err(err) if err.kind() == ErrorKind::UnexpectedEof => { + break; + } + Err(err) => { + tracing::error!("{err:?}"); + continue; + } + }; + let message: daemon_messages::ControlRequest = + match serde_json::from_slice(&raw).wrap_err("failed to deserialize node message") { + Ok(e) => e, + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + }; + + // handle the message and translate it to a NodeEvent + let node_event = match message { + daemon_messages::ControlRequest::Register { node_id } => { + id = Some(node_id); + + let reply = daemon_messages::ControlReply::Result(Ok(())); + let serialized = serde_json::to_vec(&reply) + .wrap_err("failed to serialize register result") + .unwrap(); + + match tcp_send(&mut connection, &serialized).await { + Ok(()) => continue, // don't trigger an event for register calls + Err(err) => { + tracing::warn!("{err:?}"); + break; // close connection + } + } + } + daemon_messages::ControlRequest::Stopped => DaemonNodeEvent::Stopped, + daemon_messages::ControlRequest::PrepareOutputMessage { output_id, len } => { + DaemonNodeEvent::PrepareOutputMessage { output_id, len } + } + daemon_messages::ControlRequest::SendOutMessage { id } => { + DaemonNodeEvent::SendOutMessage { id } + } + daemon_messages::ControlRequest::Subscribe { node_id } => { + let (tx, rx) = flume::bounded(10); + + id = Some(node_id); + enter_subscribe_loop = Some(rx); + + DaemonNodeEvent::Subscribe { event_sender: tx } + } + }; + + // send NodeEvent to daemon main loop + let (reply_tx, reply) = oneshot::channel(); + let event = Event::Node { + id: match &id { + Some(id) => id.clone(), + None => { + tracing::warn!( + "Ignoring node event because no register \ + message was sent yet: {node_event:?}" + ); + continue; + } + }, + event: node_event, + reply_sender: reply_tx, + }; + let Ok(()) = events_tx.send(event).await else { + break; + }; + + // wait for reply and send it out + let Ok(reply) = reply.await else 
{ + break; // main loop exited + }; + let Ok(serialized) = serde_json::to_vec(&reply) else { + tracing::error!("failed to serialize reply"); + continue; + }; + match tcp_send(&mut connection, &serialized).await { + Ok(()) => {} + Err(err) if err.kind() == ErrorKind::UnexpectedEof => { + break; + } + Err(err) => { + tracing::error!("{err:?}"); + } + } + + // enter subscribe loop after receiving a subscribe message + if let Some(events) = enter_subscribe_loop { + subscribe_loop(connection, events).await; + break; // the subscribe loop only exits when the connection was closed + } + } +} + +async fn subscribe_loop( + mut connection: TcpStream, + events: flume::Receiver, +) { + while let Some(event) = events.stream().next().await { + let message = match serde_json::to_vec(&event) { + Ok(m) => m, + Err(err) => { + let err = eyre!(err).wrap_err("failed to serialize node event"); + tracing::warn!("{err:?}"); + continue; + } + }; + match tcp_send(&mut connection, &message).await { + Ok(()) => {} + Err(err) if err.kind() == ErrorKind::UnexpectedEof => { + break; + } + Err(err) => { + tracing::error!("{err:?}"); + } + } + } +} diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 86552c2b..c76251fa 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,289 +1,225 @@ use dora_core::{ config::{DataId, NodeId}, daemon_messages::{self, ControlReply}, - topics::DORA_DAEMON_PORT_DEFAULT, + descriptor, + topics::DORA_COORDINATOR_PORT_DEFAULT, }; use dora_message::{uhlc, Metadata}; use eyre::{eyre, Context}; use futures_concurrency::stream::Merge; -use shared_memory::ShmemConf; -use std::{collections::HashMap, io::ErrorKind, net::Ipv4Addr}; +use shared_memory::{Shmem, ShmemConf}; +use std::{ + collections::{BTreeMap, HashMap}, + net::{Ipv4Addr, SocketAddr}, + path::PathBuf, +}; use tokio::{ - io::{AsyncReadExt, AsyncWriteExt}, - net::{TcpListener, TcpStream}, + net::TcpStream, sync::{mpsc, oneshot}, }; use tokio_stream::{ 
wrappers::{ReceiverStream, TcpListenerStream}, - StreamExt, + Stream, StreamExt, }; +mod coordinator; +mod listener; +mod spawn; +mod tcp_utils; + #[tokio::main] async fn main() -> eyre::Result<()> { - main_inner().await + // the tokio::main proc macro confuses some tools such as rust-analyzer, so + // directly invoke a "normal" async function + run().await } -async fn main_inner() -> eyre::Result<()> { +async fn run() -> eyre::Result<()> { set_up_tracing().wrap_err("failed to set up tracing subscriber")?; + tracing::info!("Starting in local mode"); let localhost = Ipv4Addr::new(127, 0, 0, 1); - let socket = match TcpListener::bind((localhost, DORA_DAEMON_PORT_DEFAULT)).await { - Ok(socket) => socket, - Err(err) if err.kind() == ErrorKind::AddrInUse => { - eyre::bail!( - "port {DORA_DAEMON_PORT_DEFAULT} is already in use. \ - Is `dora-daemon` already running?" - ); - } - Err(err) => { - return Err(eyre::Report::new(err).wrap_err(format!( - "failed to listen on port {DORA_DAEMON_PORT_DEFAULT}" - ))) - } - }; - - // TODO: set up connection to coordinator + let coordinator_socket = (localhost, DORA_COORDINATOR_PORT_DEFAULT); - let new_connections = TcpListenerStream::new(socket).map(|c| { - c.map(Event::NewConnection) - .wrap_err("failed to open connection") - .unwrap_or_else(Event::ConnectError) - }); - let (node_events_tx, node_events_rx) = mpsc::channel(10); - let node_events = ReceiverStream::new(node_events_rx); - - let mut events = (new_connections, node_events).merge(); - let hlc = uhlc::HLC::default(); - - let mut uninit_shared_memory = HashMap::new(); - let mut sent_out_shared_memory = HashMap::new(); + Daemon::run(coordinator_socket.into()).await +} - let mut subscribe_channels = HashMap::new(); +struct Daemon { + port: u16, + hlc: uhlc::HLC, + uninit_shared_memory: HashMap, + sent_out_shared_memory: HashMap, + subscribe_channels: HashMap>, - while let Some(event) = events.next().await { - match event { - Event::NewConnection(connection) => { - let events_tx 
= node_events_tx.clone(); - tokio::spawn(handle_connection(connection, events_tx)); - } - Event::ConnectError(err) => { - tracing::warn!("{:?}", err.wrap_err("failed to connect")); - } - Event::Node { - id, - event, - reply_sender, - } => match event { - NodeEvent::Subscribe { event_sender } => { - subscribe_channels.insert(id, event_sender); - let _ = reply_sender.send(ControlReply::Result(Ok(()))); - } - NodeEvent::PrepareOutputMessage { output_id, len } => { - let memory = ShmemConf::new() - .size(len) - .create() - .wrap_err("failed to allocate shared memory")?; - let id = memory.get_os_id().to_owned(); - uninit_shared_memory.insert(id.clone(), memory); - - let reply = ControlReply::PreparedMessage { - shared_memory_id: id.clone(), - }; - if reply_sender.send(reply).is_err() { - // free shared memory slice again - uninit_shared_memory.remove(&id); - } - } - NodeEvent::SendOutMessage { id } => { - let memory = uninit_shared_memory - .remove(&id) - .ok_or_else(|| eyre!("invalid shared memory id"))?; + node_tasks: HashMap>>, +} - // TODO figure out receivers from dataflow graph - let local_receivers = &[]; +impl Daemon { + pub async fn run(coordinator_addr: SocketAddr) -> eyre::Result<()> { + // connect to the coordinator + let coordinator_events = coordinator::connect(coordinator_addr) + .await + .wrap_err("failed to connect to dora-coordinator")? + .map(Event::Coordinator); + + // create listener for node connection + let listener = listener::create_listener().await?; + let port = listener + .local_addr() + .wrap_err("failed to get local addr of listener")? 
+ .port(); + let new_connections = TcpListenerStream::new(listener).map(|c| { + c.map(Event::NewConnection) + .wrap_err("failed to open connection") + .unwrap_or_else(Event::ConnectError) + }); + tracing::info!("Listening for node connections on 127.0.0.1:{port}"); + + let daemon = Self { + port, + hlc: uhlc::HLC::default(), + uninit_shared_memory: Default::default(), + sent_out_shared_memory: Default::default(), + subscribe_channels: Default::default(), + node_tasks: HashMap::new(), + }; + let events = (coordinator_events, new_connections).merge(); + daemon.run_inner(events).await + } - // send shared memory ID to all local receivers - let mut closed = Vec::new(); - for receiver_id in local_receivers { - if let Some(channel) = subscribe_channels.get(receiver_id) { - let input_id = DataId::from("".to_owned()); - if channel - .send_async(daemon_messages::NodeEvent::Input { - id: input_id, - metadata: Metadata::new(hlc.new_timestamp()), // TODO - data: unsafe { daemon_messages::InputData::new(id.clone()) }, - }) - .await - .is_err() - { - closed.push(receiver_id); - } - } - } - for id in closed { - subscribe_channels.remove(id); - } + async fn run_inner( + mut self, + incoming_events: impl Stream + Unpin, + ) -> eyre::Result<()> { + let (node_events_tx, node_events_rx) = mpsc::channel(10); + let node_events = ReceiverStream::new(node_events_rx); - // TODO send `data` via network to all remove receivers - let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); + let mut events = (incoming_events, node_events).merge(); - // keep shared memory ptr in order to free it once all subscribers are done - sent_out_shared_memory.insert(id, memory); + while let Some(event) = events.next().await { + match event { + Event::NewConnection(connection) => { + let events_tx = node_events_tx.clone(); + tokio::spawn(listener::handle_connection(connection, events_tx)); } - NodeEvent::Stopped => { - // TODO send stop message to downstream nodes - - let _ = 
reply_sender.send(ControlReply::Result(Ok(()))); + Event::ConnectError(err) => { + tracing::warn!("{:?}", err.wrap_err("failed to connect")); } - }, + Event::Coordinator(event) => self.handle_coordinator_event(event).await?, + Event::Node { + id, + event, + reply_sender, + } => self.handle_node_event(event, id, reply_sender).await?, + } } - } - Ok(()) -} + Ok(()) + } -async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sender) { - let mut id = None; - let mut enter_subscribe_loop = None; - loop { - // receive the next message and parse it - let raw = match tcp_receive(&mut connection).await { - Ok(data) => data, - Err(err) if err.kind() == ErrorKind::UnexpectedEof => { - break; + async fn handle_coordinator_event( + &mut self, + event: DaemonCoordinatorEvent, + ) -> Result<(), eyre::ErrReport> { + match event { + DaemonCoordinatorEvent::Spawn(spawn_command) => { + let node_id = spawn_command.node_id.clone(); + let task = spawn::spawn_node(spawn_command, self.port) + .await + .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; + self.node_tasks.insert(node_id, task); + Ok(()) } - Err(err) => { - tracing::error!("{err:?}"); - continue; + } + } + + async fn handle_node_event( + &mut self, + event: DaemonNodeEvent, + id: NodeId, + reply_sender: oneshot::Sender, + ) -> Result<(), eyre::ErrReport> { + match event { + DaemonNodeEvent::Subscribe { event_sender } => { + self.subscribe_channels.insert(id, event_sender); + let _ = reply_sender.send(ControlReply::Result(Ok(()))); } - }; - let message: daemon_messages::ControlRequest = - match serde_json::from_slice(&raw).wrap_err("failed to deserialize node message") { - Ok(e) => e, - Err(err) => { - tracing::warn!("{err:?}"); - continue; + DaemonNodeEvent::PrepareOutputMessage { output_id, len } => { + let memory = ShmemConf::new() + .size(len) + .create() + .wrap_err("failed to allocate shared memory")?; + let id = memory.get_os_id().to_owned(); + self.uninit_shared_memory.insert(id.clone(), 
memory); + + let reply = ControlReply::PreparedMessage { + shared_memory_id: id.clone(), + }; + if reply_sender.send(reply).is_err() { + // free shared memory slice again + self.uninit_shared_memory.remove(&id); } - }; - - // handle the message and translate it to a NodeEvent - let node_event = match message { - daemon_messages::ControlRequest::Register { node_id } => { - id = Some(node_id); - - let reply = daemon_messages::ControlReply::Result(Ok(())); - let serialized = serde_json::to_vec(&reply) - .wrap_err("failed to serialize register result") - .unwrap(); - - match tcp_send(&mut connection, &serialized).await { - Ok(()) => continue, // don't trigger an event for register calls - Err(err) => { - tracing::warn!("{err:?}"); - break; // close connection + } + DaemonNodeEvent::SendOutMessage { id } => { + let memory = self + .uninit_shared_memory + .remove(&id) + .ok_or_else(|| eyre!("invalid shared memory id"))?; + + // TODO figure out receivers from dataflow graph + let local_receivers = &[]; + + // send shared memory ID to all local receivers + let mut closed = Vec::new(); + for receiver_id in local_receivers { + if let Some(channel) = self.subscribe_channels.get(receiver_id) { + let input_id = DataId::from("".to_owned()); + if channel + .send_async(daemon_messages::NodeEvent::Input { + id: input_id, + metadata: Metadata::new(self.hlc.new_timestamp()), // TODO + data: unsafe { daemon_messages::InputData::new(id.clone()) }, + }) + .await + .is_err() + { + closed.push(receiver_id); + } } } - } - daemon_messages::ControlRequest::Stopped => NodeEvent::Stopped, - daemon_messages::ControlRequest::PrepareOutputMessage { output_id, len } => { - NodeEvent::PrepareOutputMessage { output_id, len } - } - daemon_messages::ControlRequest::SendOutMessage { id } => { - NodeEvent::SendOutMessage { id } - } - daemon_messages::ControlRequest::Subscribe { node_id } => { - let (tx, rx) = flume::bounded(10); - - id = Some(node_id); - enter_subscribe_loop = Some(rx); - - 
NodeEvent::Subscribe { event_sender: tx } - } - }; - - // send NodeEvent to daemon main loop - let (reply_tx, reply) = oneshot::channel(); - let event = Event::Node { - id: match &id { - Some(id) => id.clone(), - None => { - tracing::warn!( - "Ignoring node event because no register \ - message was sent yet: {node_event:?}" - ); - continue; + for id in closed { + self.subscribe_channels.remove(id); } - }, - event: node_event, - reply_sender: reply_tx, - }; - let Ok(()) = events_tx.send(event).await else { - break; - }; - // wait for reply and send it out - let Ok(reply) = reply.await else { - break; // main loop exited - }; - let Ok(serialized) = serde_json::to_vec(&reply) else { - tracing::error!("failed to serialize reply"); - continue; - }; - match tcp_send(&mut connection, &serialized).await { - Ok(()) => {} - Err(err) if err.kind() == ErrorKind::UnexpectedEof => { - break; - } - Err(err) => { - tracing::error!("{err:?}"); - } - } - - // enter subscribe loop after receiving a subscribe message - if let Some(events) = enter_subscribe_loop { - subscribe_loop(connection, events).await; - break; // the subscribe loop only exits when the connection was closed - } - } -} + // TODO send `data` via network to all remove receivers + let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); -async fn subscribe_loop( - mut connection: TcpStream, - events: flume::Receiver, -) { - while let Some(event) = events.stream().next().await { - let message = match serde_json::to_vec(&event) { - Ok(m) => m, - Err(err) => { - let err = eyre!(err).wrap_err("failed to serialize node event"); - tracing::warn!("{err:?}"); - continue; - } - }; - match tcp_send(&mut connection, &message).await { - Ok(()) => {} - Err(err) if err.kind() == ErrorKind::UnexpectedEof => { - break; + // keep shared memory ptr in order to free it once all subscribers are done + self.sent_out_shared_memory.insert(id, memory); } - Err(err) => { - tracing::error!("{err:?}"); + 
DaemonNodeEvent::Stopped => { + // TODO send stop message to downstream nodes + + let _ = reply_sender.send(ControlReply::Result(Ok(()))); } } + Ok(()) } } -enum Event { +pub enum Event { NewConnection(TcpStream), ConnectError(eyre::Report), Node { id: NodeId, - event: NodeEvent, + event: DaemonNodeEvent, reply_sender: oneshot::Sender, }, + Coordinator(DaemonCoordinatorEvent), } #[derive(Debug)] -pub enum NodeEvent { +pub enum DaemonNodeEvent { PrepareOutputMessage { output_id: DataId, len: usize, @@ -297,26 +233,21 @@ pub enum NodeEvent { }, } -type MessageId = String; - -async fn tcp_send(connection: &mut TcpStream, message: &[u8]) -> std::io::Result<()> { - let len_raw = (message.len() as u64).to_le_bytes(); - connection.write_all(&len_raw).await?; - connection.write_all(message).await?; - Ok(()) +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub enum DaemonCoordinatorEvent { + Spawn(SpawnCommand), } -async fn tcp_receive(connection: &mut TcpStream) -> std::io::Result> { - let reply_len = { - let mut raw = [0; 8]; - connection.read_exact(&mut raw).await?; - u64::from_le_bytes(raw) as usize - }; - let mut reply = vec![0; reply_len]; - connection.read_exact(&mut reply).await?; - Ok(reply) +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub struct SpawnCommand { + pub node_id: NodeId, + pub node: descriptor::CustomNode, + pub envs: Option>, + pub working_dir: PathBuf, } +type MessageId = String; + fn set_up_tracing() -> eyre::Result<()> { use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs new file mode 100644 index 00000000..849ab930 --- /dev/null +++ b/binaries/daemon/src/spawn.rs @@ -0,0 +1,78 @@ +use crate::SpawnCommand; +use dora_core::{ + daemon_messages::NodeConfig, + descriptor::{resolve_path, source_is_url}, +}; +use dora_download::download_file; +use eyre::{eyre, WrapErr}; +use std::{env::consts::EXE_EXTENSION, path::Path}; + 
+#[tracing::instrument] +pub async fn spawn_node( + spawn_command: SpawnCommand, + daemon_port: u16, +) -> eyre::Result>> { + let SpawnCommand { + node_id, + node, + envs, + working_dir, + } = spawn_command; + + let resolved_path = if source_is_url(&node.source) { + // try to download the shared library + let target_path = Path::new("build") + .join(node_id.to_string()) + .with_extension(EXE_EXTENSION); + download_file(&node.source, &target_path) + .await + .wrap_err("failed to download custom node")?; + target_path.clone() + } else { + resolve_path(&node.source, &working_dir) + .wrap_err_with(|| format!("failed to resolve node source `{}`", node.source))? + }; + let node_config = NodeConfig { + node_id: node_id.clone(), + run_config: node.run_config.clone(), + daemon_port, + }; + + let mut command = tokio::process::Command::new(&resolved_path); + if let Some(args) = &node.args { + command.args(args.split_ascii_whitespace()); + } + command.env( + "DORA_NODE_CONFIG", + serde_yaml::to_string(&node_config).wrap_err("failed to serialize node config")?, + ); + command.current_dir(working_dir); + + // Injecting the env variable defined in the `yaml` into + // the node runtime. 
+ if let Some(envs) = envs { + for (key, value) in envs { + command.env(key, value.to_string()); + } + } + + let mut child = command.spawn().wrap_err_with(move || { + format!( + "failed to run source path: `{}` with args `{}`", + resolved_path.display(), + node.args.as_deref().unwrap_or_default() + ) + })?; + let result = tokio::spawn(async move { + let status = child.wait().await.context("child process failed")?; + if status.success() { + tracing::info!("node {node_id} finished"); + Ok(()) + } else if let Some(code) = status.code() { + Err(eyre!("node {node_id} failed with exit code: {code}")) + } else { + Err(eyre!("node {node_id} failed (unknown exit code)")) + } + }); + Ok(result) +} diff --git a/binaries/daemon/src/tcp_utils.rs b/binaries/daemon/src/tcp_utils.rs new file mode 100644 index 00000000..31f5e3b5 --- /dev/null +++ b/binaries/daemon/src/tcp_utils.rs @@ -0,0 +1,22 @@ +use tokio::{ + io::{AsyncReadExt, AsyncWriteExt}, + net::TcpStream, +}; + +pub async fn tcp_send(connection: &mut TcpStream, message: &[u8]) -> std::io::Result<()> { + let len_raw = (message.len() as u64).to_le_bytes(); + connection.write_all(&len_raw).await?; + connection.write_all(message).await?; + Ok(()) +} + +pub async fn tcp_receive(connection: &mut TcpStream) -> std::io::Result> { + let reply_len = { + let mut raw = [0; 8]; + connection.read_exact(&mut raw).await?; + u64::from_le_bytes(raw) as usize + }; + let mut reply = vec![0; reply_len]; + connection.read_exact(&mut reply).await?; + Ok(reply) +} From 9a9394c204579728dc6da8d433c7a6ebc6eee8bf Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 9 Dec 2022 16:34:33 +0100 Subject: [PATCH 013/225] Update Rust node API to parse new `NodeConfig` --- apis/rust/node/src/daemon.rs | 23 ++++++++-------- apis/rust/node/src/lib.rs | 51 +++++++++++++----------------------- 2 files changed, 29 insertions(+), 45 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 3229c1c0..2b79cbdd 100644 --- 
a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -1,12 +1,11 @@ use std::{ io::{ErrorKind, Read, Write}, - net::{Ipv4Addr, TcpStream}, + net::{Ipv4Addr, SocketAddr, TcpStream}, }; use dora_core::{ config::{DataId, NodeId}, daemon_messages::{ControlRequest, NodeEvent}, - topics::DORA_DAEMON_PORT_DEFAULT, }; use eyre::{bail, eyre, Context}; @@ -18,13 +17,13 @@ pub struct DaemonConnection { } impl DaemonConnection { - pub fn init(node_id: NodeId) -> eyre::Result { - let localhost = Ipv4Addr::new(127, 0, 0, 1); + pub fn init(node_id: &NodeId, daemon_port: u16) -> eyre::Result { + let daemon_addr = (Ipv4Addr::new(127, 0, 0, 1), daemon_port).into(); let control_stream = - init_control_stream(localhost, &node_id).wrap_err("failed to init control stream")?; + init_control_stream(daemon_addr, &node_id).wrap_err("failed to init control stream")?; let event_stream = - init_event_stream(localhost, &node_id).wrap_err("failed to init event stream")?; + init_event_stream(daemon_addr, &node_id).wrap_err("failed to init event stream")?; Ok(Self { control_channel: ControlChannel(control_stream), @@ -94,9 +93,9 @@ pub struct MessageSample { pub id: String, } -fn init_event_stream(addr: Ipv4Addr, node_id: &NodeId) -> eyre::Result { - let mut event_stream = TcpStream::connect((addr, DORA_DAEMON_PORT_DEFAULT)) - .wrap_err("failed to connect to dora-daemon")?; +fn init_event_stream(daemon_addr: SocketAddr, node_id: &NodeId) -> eyre::Result { + let mut event_stream = + TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; tcp_send( &mut event_stream, &ControlRequest::Subscribe { @@ -136,9 +135,9 @@ fn init_event_stream(addr: Ipv4Addr, node_id: &NodeId) -> eyre::Result eyre::Result { - let mut control_stream = TcpStream::connect((addr, DORA_DAEMON_PORT_DEFAULT)) - .wrap_err("failed to connect to dora-daemon")?; +fn init_control_stream(daemon_addr: SocketAddr, node_id: &NodeId) -> eyre::Result { + let mut control_stream = + 
TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; tcp_send( &mut control_stream, &ControlRequest::Register { diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 5f34f314..bfa1ec6d 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -1,6 +1,9 @@ use daemon::{ControlChannel, DaemonConnection, EventStream}; pub use dora_core; -use dora_core::config::{DataId, NodeId, NodeRunConfig}; +use dora_core::{ + config::{DataId, NodeId, NodeRunConfig}, + daemon_messages::NodeConfig, +}; pub use dora_message::{uhlc, Metadata, MetadataParameters}; use eyre::WrapErr; pub use flume::Receiver; @@ -20,28 +23,30 @@ impl DoraNode { #[cfg(feature = "tracing-subscriber")] set_up_tracing().context("failed to set up tracing subscriber")?; - let id = { - let raw = - std::env::var("DORA_NODE_ID").wrap_err("env variable DORA_NODE_ID must be set")?; - serde_yaml::from_str(&raw).context("failed to deserialize operator config")? - }; let node_config = { - let raw = std::env::var("DORA_NODE_RUN_CONFIG") - .wrap_err("env variable DORA_NODE_RUN_CONFIG must be set")?; + let raw = std::env::var("DORA_NODE_CONFIG") + .wrap_err("env variable DORA_NODE_CONFIG must be set")?; serde_yaml::from_str(&raw).context("failed to deserialize operator config")? 
}; - Self::init(id, node_config) + Self::init(node_config) } - pub fn init(id: NodeId, node_config: NodeRunConfig) -> eyre::Result<(Self, EventStream)> { + pub fn init(node_config: NodeConfig) -> eyre::Result<(Self, EventStream)> { + let NodeConfig { + node_id, + run_config, + daemon_port, + } = node_config; + let DaemonConnection { control_channel, event_stream, - } = DaemonConnection::init(id.clone()).wrap_err("failed to connect to dora-daemon")?; + } = DaemonConnection::init(&node_id, daemon_port) + .wrap_err("failed to connect to dora-daemon")?; let node = Self { - id, - node_config, + id: node_id, + node_config: run_config, control_channel, hlc: uhlc::HLC::default(), }; @@ -119,23 +124,3 @@ fn set_up_tracing() -> eyre::Result<()> { tracing::subscriber::set_global_default(subscriber) .context("failed to set tracing global subscriber") } - -#[cfg(test)] -mod tests { - use dora_core::config; - - use super::*; - - #[test] - fn no_op_operator() { - let id = uuid::Uuid::new_v4().to_string().into(); - let node_config = config::NodeRunConfig { - inputs: Default::default(), - outputs: Default::default(), - }; - - let (_node, events) = DoraNode::init(id, node_config).unwrap(); - - assert!(events.recv().is_err()); - } -} From e5d7ac55e21c1c78e99376b4132fed1ee4ed2da7 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 9 Dec 2022 16:41:54 +0100 Subject: [PATCH 014/225] Bundle all nodes of a single machine in a single daemon spawn command Avoids synchronization issues, e.g. if one node sends outputs before the receiver was launched and registered. 
--- Cargo.lock | 2 ++ binaries/coordinator/src/lib.rs | 2 ++ binaries/daemon/src/main.rs | 19 ++++++++++++++----- binaries/daemon/src/spawn.rs | 8 ++++---- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 685f17ee..01cdbc55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -971,12 +971,14 @@ name = "dora-daemon" version = "0.1.0" dependencies = [ "dora-core", + "dora-download", "dora-message", "eyre", "flume", "futures-concurrency 7.0.0", "serde", "serde_json", + "serde_yaml 0.8.23", "shared_memory", "tokio", "tokio-stream", diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index a63965ae..1549f991 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -332,6 +332,8 @@ async fn start_dataflow( runtime_path: &Path, dataflow_events_tx: &Option>, ) -> eyre::Result { + // TODO: send Spawn message to daemon + let runtime_path = runtime_path.to_owned(); let dataflow_events_tx = match dataflow_events_tx { Some(channel) => channel.clone(), diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index c76251fa..f9cc57d0 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -123,11 +123,15 @@ impl Daemon { ) -> Result<(), eyre::ErrReport> { match event { DaemonCoordinatorEvent::Spawn(spawn_command) => { - let node_id = spawn_command.node_id.clone(); - let task = spawn::spawn_node(spawn_command, self.port) - .await - .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; - self.node_tasks.insert(node_id, task); + for (node_id, params) in spawn_command.nodes { + let node_id = node_id.clone(); + let task = spawn::spawn_node(params, self.port) + .await + .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; + self.node_tasks.insert(node_id, task); + } + + // TODO: spawn timers Ok(()) } } @@ -240,6 +244,11 @@ pub enum DaemonCoordinatorEvent { #[derive(Debug, serde::Deserialize, serde::Serialize)] pub struct SpawnCommand { + 
pub nodes: BTreeMap, +} + +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub struct SpawnNodeParams { pub node_id: NodeId, pub node: descriptor::CustomNode, pub envs: Option>, diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 849ab930..6f6e6b4b 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,4 +1,4 @@ -use crate::SpawnCommand; +use crate::SpawnNodeParams; use dora_core::{ daemon_messages::NodeConfig, descriptor::{resolve_path, source_is_url}, @@ -9,15 +9,15 @@ use std::{env::consts::EXE_EXTENSION, path::Path}; #[tracing::instrument] pub async fn spawn_node( - spawn_command: SpawnCommand, + params: SpawnNodeParams, daemon_port: u16, ) -> eyre::Result>> { - let SpawnCommand { + let SpawnNodeParams { node_id, node, envs, working_dir, - } = spawn_command; + } = params; let resolved_path = if source_is_url(&node.source) { // try to download the shared library From 5ae18eae4c3e29c1caa1be22a8a282082c3fc0d0 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 9 Dec 2022 16:47:50 +0100 Subject: [PATCH 015/225] Include dataflow ID in daemon to avoid node ID conflicts --- binaries/daemon/src/main.rs | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index f9cc57d0..3df07bbc 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -5,7 +5,7 @@ use dora_core::{ topics::DORA_COORDINATOR_PORT_DEFAULT, }; use dora_message::{uhlc, Metadata}; -use eyre::{eyre, Context}; +use eyre::{bail, eyre, Context}; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; use std::{ @@ -51,7 +51,7 @@ struct Daemon { sent_out_shared_memory: HashMap, subscribe_channels: HashMap>, - node_tasks: HashMap>>, + node_tasks: HashMap>>>, } impl Daemon { @@ -120,15 +120,23 @@ impl Daemon { async fn handle_coordinator_event( &mut self, event: DaemonCoordinatorEvent, - ) -> Result<(), 
eyre::ErrReport> { + ) -> eyre::Result<()> { match event { - DaemonCoordinatorEvent::Spawn(spawn_command) => { - for (node_id, params) in spawn_command.nodes { + DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { dataflow_id, nodes }) => { + let node_tasks = match self.node_tasks.entry(dataflow_id.clone()) { + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(Default::default()) + } + std::collections::hash_map::Entry::Occupied(_) => { + bail!("there is already a running dataflow with ID `{dataflow_id}`") + } + }; + for (node_id, params) in nodes { let node_id = node_id.clone(); let task = spawn::spawn_node(params, self.port) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; - self.node_tasks.insert(node_id, task); + node_tasks.insert(node_id, task); } // TODO: spawn timers @@ -239,11 +247,14 @@ pub enum DaemonNodeEvent { #[derive(Debug, serde::Deserialize, serde::Serialize)] pub enum DaemonCoordinatorEvent { - Spawn(SpawnCommand), + Spawn(SpawnDataflowNodes), } +type DataflowId = String; + #[derive(Debug, serde::Deserialize, serde::Serialize)] -pub struct SpawnCommand { +pub struct SpawnDataflowNodes { + pub dataflow_id: DataflowId, pub nodes: BTreeMap, } From cc1dc971edf078b1c4fa2e68d1160ed7f74eb650 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 9 Dec 2022 17:24:59 +0100 Subject: [PATCH 016/225] Update dora-coordinator to start dataflows through dora-daemon --- binaries/coordinator/src/lib.rs | 8 ++- binaries/coordinator/src/run/custom.rs | 96 -------------------------- binaries/coordinator/src/run/mod.rs | 89 ++++++++++++++---------- binaries/daemon/src/main.rs | 29 +------- binaries/daemon/src/spawn.rs | 6 +- libraries/core/src/daemon_messages.rs | 28 +++++++- libraries/core/src/descriptor/mod.rs | 2 +- 7 files changed, 91 insertions(+), 167 deletions(-) delete mode 100644 binaries/coordinator/src/run/custom.rs diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 
1549f991..0143b5f8 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -48,10 +48,12 @@ pub async fn run(args: Args) -> eyre::Result<()> { .with_file_name("dora-runtime") }); + let daemon_connections = &mut HashMap::new(); // TODO + match run_dataflow { Some(path) => { // start the given dataflow directly - run::run_dataflow(&path, &runtime_path) + run::run_dataflow(&path, &runtime_path, daemon_connections) .await .wrap_err_with(|| format!("failed to run dataflow at {}", path.display()))?; } @@ -168,6 +170,7 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { name, runtime_path, &dataflow_events_tx, + &mut daemon_connections, ) .await?; Ok(dataflow) @@ -331,6 +334,7 @@ async fn start_dataflow( name: Option, runtime_path: &Path, dataflow_events_tx: &Option>, + daemon_connections: &mut HashMap, ) -> eyre::Result { // TODO: send Spawn message to daemon @@ -343,7 +347,7 @@ async fn start_dataflow( uuid, communication_config, tasks, - } = spawn_dataflow(&runtime_path, path).await?; + } = spawn_dataflow(&runtime_path, path, daemon_connections).await?; let path = path.to_owned(); let task = async move { let result = await_tasks(tasks) diff --git a/binaries/coordinator/src/run/custom.rs b/binaries/coordinator/src/run/custom.rs deleted file mode 100644 index cc29ab01..00000000 --- a/binaries/coordinator/src/run/custom.rs +++ /dev/null @@ -1,96 +0,0 @@ -use super::command_init_common_env; -use dora_core::{ - config::NodeId, - descriptor::{self, resolve_path, source_is_url, EnvValue}, -}; -use dora_download::download_file; -use eyre::{bail, eyre, WrapErr}; -use std::{collections::BTreeMap, env::consts::EXE_EXTENSION, path::Path}; - -const SHELL_SOURCE: &str = "shell"; - -#[tracing::instrument] -pub(super) async fn spawn_custom_node( - node_id: NodeId, - node: &descriptor::CustomNode, - envs: &Option>, - communication: &dora_core::config::CommunicationConfig, - working_dir: &Path, -) -> eyre::Result>> { - let resolved_path = if 
source_is_url(&node.source) { - // try to download the shared library - let target_path = Path::new("build") - .join(node_id.to_string()) - .with_extension(EXE_EXTENSION); - download_file(&node.source, &target_path) - .await - .wrap_err("failed to download custom node")?; - Ok(target_path.clone()) - } else { - resolve_path(&node.source, working_dir) - }; - - let mut command = if let Ok(path) = &resolved_path { - let mut command = tokio::process::Command::new(path); - if let Some(args) = &node.args { - command.args(args.split_ascii_whitespace()); - } - command - } else if node.source == SHELL_SOURCE { - if cfg!(target_os = "windows") { - let mut cmd = tokio::process::Command::new("cmd"); - cmd.args(["/C", &node.args.clone().unwrap_or_default()]); - cmd - } else { - let mut cmd = tokio::process::Command::new("sh"); - cmd.args(["-c", &node.args.clone().unwrap_or_default()]); - cmd - } - } else { - bail!("could not understand node source: {}", node.source); - }; - - command_init_common_env(&mut command, &node_id, communication)?; - command.env( - "DORA_NODE_RUN_CONFIG", - serde_yaml::to_string(&node.run_config) - .wrap_err("failed to serialize custom node run config")?, - ); - command.current_dir(working_dir); - - // Injecting the env variable defined in the `yaml` into - // the node runtime. 
- if let Some(envs) = envs { - for (key, value) in envs { - command.env(key, value.to_string()); - } - } - - let mut child = command.spawn().wrap_err_with(|| { - if let Ok(path) = resolved_path { - format!( - "failed to run source path: `{}` with args `{}`", - path.display(), - node.args.as_deref().unwrap_or_default() - ) - } else { - format!( - "failed to run command: `{}` with args `{}`", - node.source, - node.args.as_deref().unwrap_or_default() - ) - } - })?; - let result = tokio::spawn(async move { - let status = child.wait().await.context("child process failed")?; - if status.success() { - tracing::info!("node {node_id} finished"); - Ok(()) - } else if let Some(code) = status.code() { - Err(eyre!("node {node_id} failed with exit code: {code}")) - } else { - Err(eyre!("node {node_id} failed (unknown exit code)")) - } - }); - Ok(result) -} diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index 17b3c902..e7ef3022 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -1,23 +1,40 @@ -use self::{custom::spawn_custom_node, runtime::spawn_runtime_node}; +use crate::tcp_utils::tcp_send; + +use self::runtime::spawn_runtime_node; use dora_core::{ config::{format_duration, CommunicationConfig, NodeId}, + daemon_messages::{DaemonCoordinatorEvent, SpawnDataflowNodes, SpawnNodeParams}, descriptor::{self, collect_dora_timers, CoreNodeKind, Descriptor}, }; -use eyre::{bail, eyre, WrapErr}; +use eyre::{bail, eyre, ContextCompat, WrapErr}; use futures::{stream::FuturesUnordered, StreamExt}; -use std::{env::consts::EXE_EXTENSION, path::Path}; +use std::{ + collections::{BTreeMap, HashMap}, + env::consts::EXE_EXTENSION, + path::Path, +}; +use tokio::net::TcpStream; use tokio_stream::wrappers::IntervalStream; use uuid::Uuid; -mod custom; mod runtime; -pub async fn run_dataflow(dataflow_path: &Path, runtime: &Path) -> eyre::Result<()> { - let tasks = spawn_dataflow(runtime, dataflow_path).await?.tasks; 
+pub async fn run_dataflow( + dataflow_path: &Path, + runtime: &Path, + daemon_connections: &mut HashMap, +) -> eyre::Result<()> { + let tasks = spawn_dataflow(runtime, dataflow_path, daemon_connections) + .await? + .tasks; await_tasks(tasks).await } -pub async fn spawn_dataflow(runtime: &Path, dataflow_path: &Path) -> eyre::Result { +pub async fn spawn_dataflow( + runtime: &Path, + dataflow_path: &Path, + daemon_connections: &mut HashMap, +) -> eyre::Result { let mut runtime = runtime.with_extension(EXE_EXTENSION); let descriptor = read_descriptor(dataflow_path).await.wrap_err_with(|| { format!( @@ -57,39 +74,37 @@ pub async fn spawn_dataflow(runtime: &Path, dataflow_path: &Path) -> eyre::Resul } } } - let tasks = FuturesUnordered::new(); - for node in nodes { - let node_id = node.id.clone(); + let mut custom_nodes = BTreeMap::new(); + for node in nodes { match node.kind { - descriptor::CoreNodeKind::Custom(custom) => { - let result = spawn_custom_node( - node_id.clone(), - &custom, - &node.env, - &communication_config, - &working_dir, - ) - .await - .wrap_err_with(|| format!("failed to spawn custom node {node_id}"))?; - tasks.push(result); - } - descriptor::CoreNodeKind::Runtime(runtime_node) => { - if !runtime_node.operators.is_empty() { - let result = spawn_runtime_node( - &runtime, - node_id.clone(), - &runtime_node, - &node.env, - &communication_config, - &working_dir, - ) - .wrap_err_with(|| format!("failed to spawn runtime node {node_id}"))?; - tasks.push(result); - } + CoreNodeKind::Runtime(_) => todo!(), + CoreNodeKind::Custom(n) => { + custom_nodes.insert( + node.id.clone(), + SpawnNodeParams { + node_id: node.id, + node: n, + working_dir: working_dir.clone(), + }, + ); } } } + + let spawn_command = SpawnDataflowNodes { + dataflow_id: uuid, + nodes: custom_nodes, + }; + let message = serde_json::to_vec(&DaemonCoordinatorEvent::Spawn(spawn_command))?; + let daemon_connection = daemon_connections + .get_mut("") + .wrap_err("no daemon connection")?; // 
TODO: take from dataflow spec + tcp_send(daemon_connection, &message) + .await + .wrap_err("failed to send spawn message to daemon")?; + + // TODO for interval in dora_timers { let communication_config = communication_config.clone(); // let mut communication = @@ -112,12 +127,12 @@ pub async fn spawn_dataflow(runtime: &Path, dataflow_path: &Path) -> eyre::Resul // .unwrap() // .publish(&data) // .expect("failed to publish timer tick message"); - todo!() + // todo!() } }); } Ok(SpawnedDataflow { - tasks, + tasks: FuturesUnordered::new(), // TODO communication_config, uuid, }) diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 3df07bbc..b8e62cd6 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,7 +1,6 @@ use dora_core::{ config::{DataId, NodeId}, - daemon_messages::{self, ControlReply}, - descriptor, + daemon_messages::{self, ControlReply, DaemonCoordinatorEvent, DataflowId, SpawnDataflowNodes}, topics::DORA_COORDINATOR_PORT_DEFAULT, }; use dora_message::{uhlc, Metadata}; @@ -9,9 +8,8 @@ use eyre::{bail, eyre, Context}; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; use std::{ - collections::{BTreeMap, HashMap}, + collections::HashMap, net::{Ipv4Addr, SocketAddr}, - path::PathBuf, }; use tokio::{ net::TcpStream, @@ -123,7 +121,7 @@ impl Daemon { ) -> eyre::Result<()> { match event { DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { dataflow_id, nodes }) => { - let node_tasks = match self.node_tasks.entry(dataflow_id.clone()) { + let node_tasks = match self.node_tasks.entry(dataflow_id) { std::collections::hash_map::Entry::Vacant(entry) => { entry.insert(Default::default()) } @@ -245,27 +243,6 @@ pub enum DaemonNodeEvent { }, } -#[derive(Debug, serde::Deserialize, serde::Serialize)] -pub enum DaemonCoordinatorEvent { - Spawn(SpawnDataflowNodes), -} - -type DataflowId = String; - -#[derive(Debug, serde::Deserialize, serde::Serialize)] -pub struct SpawnDataflowNodes { - pub 
dataflow_id: DataflowId, - pub nodes: BTreeMap, -} - -#[derive(Debug, serde::Deserialize, serde::Serialize)] -pub struct SpawnNodeParams { - pub node_id: NodeId, - pub node: descriptor::CustomNode, - pub envs: Option>, - pub working_dir: PathBuf, -} - type MessageId = String; fn set_up_tracing() -> eyre::Result<()> { diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 6f6e6b4b..832ee2a0 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,6 +1,5 @@ -use crate::SpawnNodeParams; use dora_core::{ - daemon_messages::NodeConfig, + daemon_messages::{NodeConfig, SpawnNodeParams}, descriptor::{resolve_path, source_is_url}, }; use dora_download::download_file; @@ -15,7 +14,6 @@ pub async fn spawn_node( let SpawnNodeParams { node_id, node, - envs, working_dir, } = params; @@ -50,7 +48,7 @@ pub async fn spawn_node( // Injecting the env variable defined in the `yaml` into // the node runtime. - if let Some(envs) = envs { + if let Some(envs) = node.envs { for (key, value) in envs { command.env(key, value.to_string()); } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index df3a849a..526e97f8 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -1,7 +1,13 @@ -use crate::config::{DataId, NodeId, NodeRunConfig}; +use std::{collections::BTreeMap, path::PathBuf}; + +use crate::{ + config::{DataId, NodeId, NodeRunConfig}, + descriptor, +}; use dora_message::Metadata; use eyre::Context; use shared_memory::{Shmem, ShmemConf}; +use uuid::Uuid; #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct NodeConfig { @@ -67,3 +73,23 @@ impl std::ops::Deref for MappedInputData { unsafe { self.memory.as_slice() } } } + +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub enum DaemonCoordinatorEvent { + Spawn(SpawnDataflowNodes), +} + +pub type DataflowId = Uuid; + +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub struct 
SpawnDataflowNodes { + pub dataflow_id: DataflowId, + pub nodes: BTreeMap, +} + +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub struct SpawnNodeParams { + pub node_id: NodeId, + pub node: descriptor::CustomNode, + pub working_dir: PathBuf, +} diff --git a/libraries/core/src/descriptor/mod.rs b/libraries/core/src/descriptor/mod.rs index d3ed0459..53f614a9 100644 --- a/libraries/core/src/descriptor/mod.rs +++ b/libraries/core/src/descriptor/mod.rs @@ -211,7 +211,7 @@ pub struct CustomNode { pub source: String, #[serde(default, skip_serializing_if = "Option::is_none")] pub args: Option, - pub working_directory: Option>, + pub envs: Option>, #[serde(default, skip_serializing_if = "Option::is_none")] pub build: Option, From 0ae5fc2ffa257f86d8e6a431c8d9163b0f5bc32b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 10:59:46 +0100 Subject: [PATCH 017/225] Add dataflow ID to daemon messages --- Cargo.lock | 1 + apis/rust/node/src/daemon.rs | 26 +++++++++----- apis/rust/node/src/lib.rs | 3 +- binaries/daemon/Cargo.toml | 1 + binaries/daemon/src/listener.rs | 37 +++++++++++-------- binaries/daemon/src/main.rs | 51 +++++++++++++++++++-------- binaries/daemon/src/spawn.rs | 4 ++- libraries/core/src/daemon_messages.rs | 20 ++++++++--- 8 files changed, 100 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 618c71f0..3d187199 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -984,6 +984,7 @@ dependencies = [ "tokio-stream", "tracing", "tracing-subscriber", + "uuid 1.2.1", ] [[package]] diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 2b79cbdd..7735d0b4 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -5,7 +5,7 @@ use std::{ use dora_core::{ config::{DataId, NodeId}, - daemon_messages::{ControlRequest, NodeEvent}, + daemon_messages::{ControlRequest, DataflowId, NodeEvent}, }; use eyre::{bail, eyre, Context}; @@ -17,13 +17,13 @@ pub struct DaemonConnection { } impl 
DaemonConnection { - pub fn init(node_id: &NodeId, daemon_port: u16) -> eyre::Result { + pub fn init(dataflow_id: DataflowId, node_id: &NodeId, daemon_port: u16) -> eyre::Result { let daemon_addr = (Ipv4Addr::new(127, 0, 0, 1), daemon_port).into(); - let control_stream = - init_control_stream(daemon_addr, &node_id).wrap_err("failed to init control stream")?; + let control_stream = init_control_stream(daemon_addr, dataflow_id, &node_id) + .wrap_err("failed to init control stream")?; - let event_stream = - init_event_stream(daemon_addr, &node_id).wrap_err("failed to init event stream")?; + let event_stream = init_event_stream(daemon_addr, dataflow_id, &node_id) + .wrap_err("failed to init event stream")?; Ok(Self { control_channel: ControlChannel(control_stream), @@ -93,12 +93,17 @@ pub struct MessageSample { pub id: String, } -fn init_event_stream(daemon_addr: SocketAddr, node_id: &NodeId) -> eyre::Result { +fn init_event_stream( + daemon_addr: SocketAddr, + dataflow_id: DataflowId, + node_id: &NodeId, +) -> eyre::Result { let mut event_stream = TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; tcp_send( &mut event_stream, &ControlRequest::Subscribe { + dataflow_id, node_id: node_id.clone(), }, ) @@ -135,12 +140,17 @@ fn init_event_stream(daemon_addr: SocketAddr, node_id: &NodeId) -> eyre::Result< Ok(rx) } -fn init_control_stream(daemon_addr: SocketAddr, node_id: &NodeId) -> eyre::Result { +fn init_control_stream( + daemon_addr: SocketAddr, + dataflow_id: DataflowId, + node_id: &NodeId, +) -> eyre::Result { let mut control_stream = TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; tcp_send( &mut control_stream, &ControlRequest::Register { + dataflow_id, node_id: node_id.clone(), }, ) diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index bfa1ec6d..65825a49 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -33,6 +33,7 @@ impl DoraNode { pub fn init(node_config: 
NodeConfig) -> eyre::Result<(Self, EventStream)> { let NodeConfig { + dataflow_id, node_id, run_config, daemon_port, @@ -41,7 +42,7 @@ impl DoraNode { let DaemonConnection { control_channel, event_stream, - } = DaemonConnection::init(&node_id, daemon_port) + } = DaemonConnection::init(dataflow_id, &node_id, daemon_port) .wrap_err("failed to connect to dora-daemon")?; let node = Self { diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 530c7a09..55f6a27d 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -20,3 +20,4 @@ dora-message = { path = "../../libraries/message" } flume = "0.10.14" dora-download = { path = "../../libraries/extensions/download" } serde_yaml = "0.8.23" +uuid = { version = "1.1.2", features = ["v4"] } diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index eb42399d..c51d49ce 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -48,8 +48,11 @@ pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sende // handle the message and translate it to a NodeEvent let node_event = match message { - daemon_messages::ControlRequest::Register { node_id } => { - id = Some(node_id); + daemon_messages::ControlRequest::Register { + dataflow_id, + node_id, + } => { + id = Some((dataflow_id, node_id)); let reply = daemon_messages::ControlReply::Result(Ok(())); let serialized = serde_json::to_vec(&reply) @@ -71,29 +74,35 @@ pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sende daemon_messages::ControlRequest::SendOutMessage { id } => { DaemonNodeEvent::SendOutMessage { id } } - daemon_messages::ControlRequest::Subscribe { node_id } => { + daemon_messages::ControlRequest::Subscribe { + dataflow_id, + node_id, + } => { let (tx, rx) = flume::bounded(10); - id = Some(node_id); + id = Some((dataflow_id, node_id)); enter_subscribe_loop = Some(rx); DaemonNodeEvent::Subscribe { event_sender: tx } } }; + let 
(dataflow_id, node_id) = match &id { + Some(id) => id.clone(), + None => { + tracing::warn!( + "Ignoring node event because no register \ + message was sent yet: {node_event:?}" + ); + continue; + } + }; + // send NodeEvent to daemon main loop let (reply_tx, reply) = oneshot::channel(); let event = Event::Node { - id: match &id { - Some(id) => id.clone(), - None => { - tracing::warn!( - "Ignoring node event because no register \ - message was sent yet: {node_event:?}" - ); - continue; - } - }, + dataflow_id, + node_id, event: node_event, reply_sender: reply_tx, }; diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index b8e62cd6..880cbea5 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -4,7 +4,7 @@ use dora_core::{ topics::DORA_COORDINATOR_PORT_DEFAULT, }; use dora_message::{uhlc, Metadata}; -use eyre::{bail, eyre, Context}; +use eyre::{bail, eyre, Context, ContextCompat}; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; use std::{ @@ -47,9 +47,8 @@ struct Daemon { hlc: uhlc::HLC, uninit_shared_memory: HashMap, sent_out_shared_memory: HashMap, - subscribe_channels: HashMap>, - node_tasks: HashMap>>>, + running: HashMap, } impl Daemon { @@ -78,8 +77,7 @@ impl Daemon { hlc: uhlc::HLC::default(), uninit_shared_memory: Default::default(), sent_out_shared_memory: Default::default(), - subscribe_channels: Default::default(), - node_tasks: HashMap::new(), + running: HashMap::new(), }; let events = (coordinator_events, new_connections).merge(); daemon.run_inner(events).await @@ -105,10 +103,14 @@ impl Daemon { } Event::Coordinator(event) => self.handle_coordinator_event(event).await?, Event::Node { - id, + dataflow_id: dataflow, + node_id, event, reply_sender, - } => self.handle_node_event(event, id, reply_sender).await?, + } => { + self.handle_node_event(event, dataflow, node_id, reply_sender) + .await? 
+ } } } @@ -121,7 +123,7 @@ impl Daemon { ) -> eyre::Result<()> { match event { DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { dataflow_id, nodes }) => { - let node_tasks = match self.node_tasks.entry(dataflow_id) { + let dataflow = match self.running.entry(dataflow_id) { std::collections::hash_map::Entry::Vacant(entry) => { entry.insert(Default::default()) } @@ -131,10 +133,10 @@ impl Daemon { }; for (node_id, params) in nodes { let node_id = node_id.clone(); - let task = spawn::spawn_node(params, self.port) + let task = spawn::spawn_node(dataflow_id, params, self.port) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; - node_tasks.insert(node_id, task); + dataflow.node_tasks.insert(node_id, task); } // TODO: spawn timers @@ -146,13 +148,20 @@ impl Daemon { async fn handle_node_event( &mut self, event: DaemonNodeEvent, + dataflow: DataflowId, id: NodeId, reply_sender: oneshot::Sender, ) -> Result<(), eyre::ErrReport> { match event { DaemonNodeEvent::Subscribe { event_sender } => { - self.subscribe_channels.insert(id, event_sender); - let _ = reply_sender.send(ControlReply::Result(Ok(()))); + let result = match self.running.get_mut(&dataflow) { + Some(dataflow) => { + dataflow.subscribe_channels.insert(id, event_sender); + Ok(()) + } + None => Err(format!("no running dataflow with ID `{dataflow}`")), + }; + let _ = reply_sender.send(ControlReply::Result(result)); } DaemonNodeEvent::PrepareOutputMessage { output_id, len } => { let memory = ShmemConf::new() @@ -176,13 +185,18 @@ impl Daemon { .remove(&id) .ok_or_else(|| eyre!("invalid shared memory id"))?; + let dataflow = self + .running + .get_mut(&dataflow) + .wrap_err_with(|| format!("no running dataflow with ID `{dataflow}`"))?; + // TODO figure out receivers from dataflow graph let local_receivers = &[]; // send shared memory ID to all local receivers let mut closed = Vec::new(); for receiver_id in local_receivers { - if let Some(channel) = self.subscribe_channels.get(receiver_id) { + 
if let Some(channel) = dataflow.subscribe_channels.get(receiver_id) { let input_id = DataId::from("".to_owned()); if channel .send_async(daemon_messages::NodeEvent::Input { @@ -198,7 +212,7 @@ impl Daemon { } } for id in closed { - self.subscribe_channels.remove(id); + dataflow.subscribe_channels.remove(id); } // TODO send `data` via network to all remove receivers @@ -217,11 +231,18 @@ impl Daemon { } } +#[derive(Default)] +pub struct RunningDataflow { + subscribe_channels: HashMap>, + node_tasks: HashMap>>, +} + pub enum Event { NewConnection(TcpStream), ConnectError(eyre::Report), Node { - id: NodeId, + dataflow_id: DataflowId, + node_id: NodeId, event: DaemonNodeEvent, reply_sender: oneshot::Sender, }, diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 832ee2a0..5fcc8237 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,5 +1,5 @@ use dora_core::{ - daemon_messages::{NodeConfig, SpawnNodeParams}, + daemon_messages::{DataflowId, NodeConfig, SpawnNodeParams}, descriptor::{resolve_path, source_is_url}, }; use dora_download::download_file; @@ -8,6 +8,7 @@ use std::{env::consts::EXE_EXTENSION, path::Path}; #[tracing::instrument] pub async fn spawn_node( + dataflow_id: DataflowId, params: SpawnNodeParams, daemon_port: u16, ) -> eyre::Result>> { @@ -31,6 +32,7 @@ pub async fn spawn_node( .wrap_err_with(|| format!("failed to resolve node source `{}`", node.source))? 
}; let node_config = NodeConfig { + dataflow_id, node_id: node_id.clone(), run_config: node.run_config.clone(), daemon_port, diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 526e97f8..41f82977 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -11,6 +11,7 @@ use uuid::Uuid; #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct NodeConfig { + pub dataflow_id: DataflowId, pub node_id: NodeId, pub run_config: NodeRunConfig, pub daemon_port: u16, @@ -18,10 +19,21 @@ pub struct NodeConfig { #[derive(Debug, serde::Serialize, serde::Deserialize)] pub enum ControlRequest { - Register { node_id: NodeId }, - Subscribe { node_id: NodeId }, - PrepareOutputMessage { output_id: DataId, len: usize }, - SendOutMessage { id: SharedMemoryId }, + Register { + dataflow_id: DataflowId, + node_id: NodeId, + }, + Subscribe { + dataflow_id: DataflowId, + node_id: NodeId, + }, + PrepareOutputMessage { + output_id: DataId, + len: usize, + }, + SendOutMessage { + id: SharedMemoryId, + }, Stopped, } From a0d0b21c1b7538c9d260061398c6af9fc0e39dea Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 11:35:47 +0100 Subject: [PATCH 018/225] Map outputs to inputs of local nodes --- binaries/daemon/src/main.rs | 55 +++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 880cbea5..77f68bd4 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,5 +1,5 @@ use dora_core::{ - config::{DataId, NodeId}, + config::{DataId, InputMapping, NodeId}, daemon_messages::{self, ControlReply, DaemonCoordinatorEvent, DataflowId, SpawnDataflowNodes}, topics::DORA_COORDINATOR_PORT_DEFAULT, }; @@ -8,7 +8,7 @@ use eyre::{bail, eyre, Context, ContextCompat}; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; use std::{ - 
collections::HashMap, + collections::{BTreeSet, HashMap}, net::{Ipv4Addr, SocketAddr}, }; use tokio::{ @@ -45,7 +45,7 @@ async fn run() -> eyre::Result<()> { struct Daemon { port: u16, hlc: uhlc::HLC, - uninit_shared_memory: HashMap, + uninit_shared_memory: HashMap, sent_out_shared_memory: HashMap, running: HashMap, @@ -132,7 +132,16 @@ impl Daemon { } }; for (node_id, params) in nodes { - let node_id = node_id.clone(); + for (input_id, mapping) in params.node.run_config.inputs.clone() { + if let InputMapping::User(mapping) = mapping { + dataflow + .mappings + .entry((mapping.source, mapping.output)) + .or_default() + .insert((node_id.clone(), input_id)); + } + } + let task = spawn::spawn_node(dataflow_id, params, self.port) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; @@ -148,18 +157,18 @@ impl Daemon { async fn handle_node_event( &mut self, event: DaemonNodeEvent, - dataflow: DataflowId, - id: NodeId, + dataflow_id: DataflowId, + node_id: NodeId, reply_sender: oneshot::Sender, ) -> Result<(), eyre::ErrReport> { match event { DaemonNodeEvent::Subscribe { event_sender } => { - let result = match self.running.get_mut(&dataflow) { + let result = match self.running.get_mut(&dataflow_id) { Some(dataflow) => { - dataflow.subscribe_channels.insert(id, event_sender); + dataflow.subscribe_channels.insert(node_id, event_sender); Ok(()) } - None => Err(format!("no running dataflow with ID `{dataflow}`")), + None => Err(format!("no running dataflow with ID `{dataflow_id}`")), }; let _ = reply_sender.send(ControlReply::Result(result)); } @@ -169,7 +178,8 @@ impl Daemon { .create() .wrap_err("failed to allocate shared memory")?; let id = memory.get_os_id().to_owned(); - self.uninit_shared_memory.insert(id.clone(), memory); + self.uninit_shared_memory + .insert(id.clone(), (output_id, memory)); let reply = ControlReply::PreparedMessage { shared_memory_id: id.clone(), @@ -180,27 +190,30 @@ impl Daemon { } } DaemonNodeEvent::SendOutMessage { id } => { - 
let memory = self + let (output_id, memory) = self .uninit_shared_memory .remove(&id) .ok_or_else(|| eyre!("invalid shared memory id"))?; let dataflow = self .running - .get_mut(&dataflow) - .wrap_err_with(|| format!("no running dataflow with ID `{dataflow}`"))?; + .get_mut(&dataflow_id) + .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; - // TODO figure out receivers from dataflow graph - let local_receivers = &[]; + // figure out receivers from dataflow graph + let empty_set = BTreeSet::new(); + let local_receivers = dataflow + .mappings + .get(&(node_id, output_id)) + .unwrap_or(&empty_set); // send shared memory ID to all local receivers let mut closed = Vec::new(); - for receiver_id in local_receivers { + for (receiver_id, input_id) in local_receivers { if let Some(channel) = dataflow.subscribe_channels.get(receiver_id) { - let input_id = DataId::from("".to_owned()); if channel .send_async(daemon_messages::NodeEvent::Input { - id: input_id, + id: input_id.clone(), metadata: Metadata::new(self.hlc.new_timestamp()), // TODO data: unsafe { daemon_messages::InputData::new(id.clone()) }, }) @@ -220,6 +233,8 @@ impl Daemon { // keep shared memory ptr in order to free it once all subscribers are done self.sent_out_shared_memory.insert(id, memory); + + let _ = reply_sender.send(ControlReply::Result(Ok(()))); } DaemonNodeEvent::Stopped => { // TODO send stop message to downstream nodes @@ -235,8 +250,12 @@ impl Daemon { pub struct RunningDataflow { subscribe_channels: HashMap>, node_tasks: HashMap>>, + mappings: HashMap>, } +type OutputId = (NodeId, DataId); +type InputId = (NodeId, DataId); + pub enum Event { NewConnection(TcpStream), ConnectError(eyre::Report), From 1724c86af81db7c9505cd55dec8df16144294939 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 12:08:33 +0100 Subject: [PATCH 019/225] Also treat `BrokenPipe` error as stopped connection --- binaries/daemon/src/listener.rs | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index c51d49ce..9dd4ba0d 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -151,7 +151,10 @@ async fn subscribe_loop( }; match tcp_send(&mut connection, &message).await { Ok(()) => {} - Err(err) if err.kind() == ErrorKind::UnexpectedEof => { + Err(err) + if err.kind() == ErrorKind::UnexpectedEof + || err.kind() == ErrorKind::BrokenPipe => + { break; } Err(err) => { From ce4e5a16163bef75c7a553ff2dc54064ef92bd59 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 12:09:16 +0100 Subject: [PATCH 020/225] Send metadata through TCP for now --- apis/rust/node/src/daemon.rs | 9 +++++++-- apis/rust/node/src/lib.rs | 11 +++-------- binaries/daemon/src/listener.rs | 12 +++++++++--- binaries/daemon/src/main.rs | 22 ++++++++++++---------- libraries/core/src/daemon_messages.rs | 3 ++- libraries/message/src/lib.rs | 2 +- 6 files changed, 34 insertions(+), 25 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 7735d0b4..975a1d3c 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -52,11 +52,16 @@ impl ControlChannel { pub fn prepare_message( &mut self, output_id: DataId, - len: usize, + metadata: dora_message::Metadata<'static>, + data_len: usize, ) -> eyre::Result { tcp_send( &mut self.0, - &ControlRequest::PrepareOutputMessage { output_id, len }, + &ControlRequest::PrepareOutputMessage { + output_id, + metadata, + data_len, + }, ) .wrap_err("failed to send PrepareOutputMessage request to dora-daemon")?; match tcp_receive(&mut self.0) diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 65825a49..f12e6a70 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -67,15 +67,11 @@ impl DoraNode { if !self.node_config.outputs.contains(&output_id) { eyre::bail!("unknown output"); } - let metadata = 
Metadata::from_parameters(self.hlc.new_timestamp(), parameters); - let serialized_metadata = metadata - .serialize() - .with_context(|| format!("failed to serialize `{}` message", output_id))?; - let full_len = serialized_metadata.len() + data_len; + let metadata = Metadata::from_parameters(self.hlc.new_timestamp(), parameters.into_owned()); let sample = self .control_channel - .prepare_message(output_id.clone(), full_len) + .prepare_message(output_id.clone(), metadata, data_len) .wrap_err("failed to prepare sample for output message")?; // map shared memory and fill in data @@ -86,8 +82,7 @@ impl DoraNode { .wrap_err("failed to open shared memory sample")?; let raw = unsafe { shared_memory.as_slice_mut() }; - raw[..serialized_metadata.len()].copy_from_slice(&serialized_metadata); - data(&mut raw[serialized_metadata.len()..]); + data(raw); } self.control_channel diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 9dd4ba0d..de1b18a8 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -68,9 +68,15 @@ pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sende } } daemon_messages::ControlRequest::Stopped => DaemonNodeEvent::Stopped, - daemon_messages::ControlRequest::PrepareOutputMessage { output_id, len } => { - DaemonNodeEvent::PrepareOutputMessage { output_id, len } - } + daemon_messages::ControlRequest::PrepareOutputMessage { + output_id, + metadata, + data_len, + } => DaemonNodeEvent::PrepareOutputMessage { + output_id, + metadata, + data_len, + }, daemon_messages::ControlRequest::SendOutMessage { id } => { DaemonNodeEvent::SendOutMessage { id } } diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 77f68bd4..0fd9f4fd 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -3,7 +3,6 @@ use dora_core::{ daemon_messages::{self, ControlReply, DaemonCoordinatorEvent, DataflowId, SpawnDataflowNodes}, 
topics::DORA_COORDINATOR_PORT_DEFAULT, }; -use dora_message::{uhlc, Metadata}; use eyre::{bail, eyre, Context, ContextCompat}; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; @@ -44,8 +43,7 @@ async fn run() -> eyre::Result<()> { struct Daemon { port: u16, - hlc: uhlc::HLC, - uninit_shared_memory: HashMap, + uninit_shared_memory: HashMap, Shmem)>, sent_out_shared_memory: HashMap, running: HashMap, @@ -74,7 +72,6 @@ impl Daemon { let daemon = Self { port, - hlc: uhlc::HLC::default(), uninit_shared_memory: Default::default(), sent_out_shared_memory: Default::default(), running: HashMap::new(), @@ -172,14 +169,18 @@ impl Daemon { }; let _ = reply_sender.send(ControlReply::Result(result)); } - DaemonNodeEvent::PrepareOutputMessage { output_id, len } => { + DaemonNodeEvent::PrepareOutputMessage { + output_id, + metadata, + data_len, + } => { let memory = ShmemConf::new() - .size(len) + .size(data_len) .create() .wrap_err("failed to allocate shared memory")?; let id = memory.get_os_id().to_owned(); self.uninit_shared_memory - .insert(id.clone(), (output_id, memory)); + .insert(id.clone(), (output_id, metadata, memory)); let reply = ControlReply::PreparedMessage { shared_memory_id: id.clone(), @@ -190,7 +191,7 @@ impl Daemon { } } DaemonNodeEvent::SendOutMessage { id } => { - let (output_id, memory) = self + let (output_id, metadata, memory) = self .uninit_shared_memory .remove(&id) .ok_or_else(|| eyre!("invalid shared memory id"))?; @@ -214,7 +215,7 @@ impl Daemon { if channel .send_async(daemon_messages::NodeEvent::Input { id: input_id.clone(), - metadata: Metadata::new(self.hlc.new_timestamp()), // TODO + metadata: metadata.clone(), data: unsafe { daemon_messages::InputData::new(id.clone()) }, }) .await @@ -272,7 +273,8 @@ pub enum Event { pub enum DaemonNodeEvent { PrepareOutputMessage { output_id: DataId, - len: usize, + metadata: dora_message::Metadata<'static>, + data_len: usize, }, SendOutMessage { id: MessageId, diff --git 
a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 41f82977..7a1eaf4a 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -29,7 +29,8 @@ pub enum ControlRequest { }, PrepareOutputMessage { output_id: DataId, - len: usize, + metadata: Metadata<'static>, + data_len: usize, }, SendOutMessage { id: SharedMemoryId, diff --git a/libraries/message/src/lib.rs b/libraries/message/src/lib.rs index f0bc6c8c..891e854f 100644 --- a/libraries/message/src/lib.rs +++ b/libraries/message/src/lib.rs @@ -22,7 +22,7 @@ pub struct MetadataParameters<'a> { } impl MetadataParameters<'_> { - fn into_owned(self) -> MetadataParameters<'static> { + pub fn into_owned(self) -> MetadataParameters<'static> { MetadataParameters { open_telemetry_context: self.open_telemetry_context.into_owned().into(), ..self From 672fb4d0b4af023ef2d859a0c41bdf16737a253b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 13:15:53 +0100 Subject: [PATCH 021/225] Close connections on errors --- binaries/daemon/src/listener.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index de1b18a8..f3b0d3dc 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -159,12 +159,14 @@ async fn subscribe_loop( Ok(()) => {} Err(err) if err.kind() == ErrorKind::UnexpectedEof - || err.kind() == ErrorKind::BrokenPipe => + || err.kind() == ErrorKind::BrokenPipe + || err.kind() == ErrorKind::ConnectionReset => { break; } Err(err) => { tracing::error!("{err:?}"); + break; } } } From a10fd0b1cbc71c5478a0f0ae5dfb5779b7b8d114 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 13:16:38 +0100 Subject: [PATCH 022/225] Implement timer messages --- binaries/daemon/src/main.rs | 118 ++++++++++++++++++++++-- examples/rust-dataflow/sink/src/main.rs | 4 +- libraries/core/src/daemon_messages.rs | 2 +- 3 files 
changed, 111 insertions(+), 13 deletions(-) diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 0fd9f4fd..1352718a 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -3,12 +3,14 @@ use dora_core::{ daemon_messages::{self, ControlReply, DaemonCoordinatorEvent, DataflowId, SpawnDataflowNodes}, topics::DORA_COORDINATOR_PORT_DEFAULT, }; +use dora_message::uhlc::HLC; use eyre::{bail, eyre, Context, ContextCompat}; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; use std::{ - collections::{BTreeSet, HashMap}, + collections::{BTreeMap, BTreeSet, HashMap}, net::{Ipv4Addr, SocketAddr}, + time::Duration, }; use tokio::{ net::TcpStream, @@ -47,6 +49,8 @@ struct Daemon { sent_out_shared_memory: HashMap, running: HashMap, + + dora_events_tx: mpsc::Sender, } impl Daemon { @@ -70,13 +74,16 @@ impl Daemon { }); tracing::info!("Listening for node connections on 127.0.0.1:{port}"); + let (dora_events_tx, dora_events_rx) = mpsc::channel(5); let daemon = Self { port, uninit_shared_memory: Default::default(), sent_out_shared_memory: Default::default(), running: HashMap::new(), + dora_events_tx, }; - let events = (coordinator_events, new_connections).merge(); + let dora_events = ReceiverStream::new(dora_events_rx).map(Event::Dora); + let events = (coordinator_events, new_connections, dora_events).merge(); daemon.run_inner(events).await } @@ -108,6 +115,7 @@ impl Daemon { self.handle_node_event(event, dataflow, node_id, reply_sender) .await? 
} + Event::Dora(event) => self.handle_dora_event(event).await?, } } @@ -128,14 +136,27 @@ impl Daemon { bail!("there is already a running dataflow with ID `{dataflow_id}`") } }; + for (node_id, params) in nodes { for (input_id, mapping) in params.node.run_config.inputs.clone() { - if let InputMapping::User(mapping) = mapping { - dataflow - .mappings - .entry((mapping.source, mapping.output)) - .or_default() - .insert((node_id.clone(), input_id)); + match mapping { + InputMapping::User(mapping) => { + if mapping.operator.is_some() { + bail!("operators are not supported"); + } + dataflow + .mappings + .entry((mapping.source, mapping.output)) + .or_default() + .insert((node_id.clone(), input_id)); + } + InputMapping::Timer { interval } => { + dataflow + .timers + .entry(interval) + .or_default() + .insert((node_id.clone(), input_id)); + } } } @@ -145,7 +166,31 @@ impl Daemon { dataflow.node_tasks.insert(node_id, task); } - // TODO: spawn timers + // spawn timer tasks + for interval in dataflow.timers.keys().copied() { + let events_tx = self.dora_events_tx.clone(); + let task = async move { + let mut interval_stream = tokio::time::interval(interval); + let hlc = HLC::default(); + loop { + interval_stream.tick().await; + + let event = DoraEvent::Timer { + dataflow_id, + interval, + metadata: dora_message::Metadata::from_parameters( + hlc.new_timestamp(), + Default::default(), + ), + }; + if events_tx.send(event).await.is_err() { + break; + } + } + }; + tokio::spawn(task); + } + Ok(()) } } @@ -216,7 +261,7 @@ impl Daemon { .send_async(daemon_messages::NodeEvent::Input { id: input_id.clone(), metadata: metadata.clone(), - data: unsafe { daemon_messages::InputData::new(id.clone()) }, + data: Some(unsafe { daemon_messages::InputData::new(id.clone()) }), }) .await .is_err() @@ -245,6 +290,49 @@ impl Daemon { } Ok(()) } + + async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result<()> { + match event { + DoraEvent::Timer { + dataflow_id, + interval, + metadata, 
+ } => { + let Some(dataflow) = self.running.get_mut(&dataflow_id) else { + tracing::warn!("Timer event for unknown dataflow `{dataflow_id}`"); + return Ok(()) + }; + + let Some(subscribers) = dataflow.timers.get(&interval) else { + return Ok(()); + }; + + let mut closed = Vec::new(); + for (receiver_id, input_id) in subscribers { + let Some(channel) = dataflow.subscribe_channels.get(receiver_id) else { + continue; + }; + + if channel + .send_async(daemon_messages::NodeEvent::Input { + id: input_id.clone(), + metadata: metadata.clone(), + data: None, + }) + .await + .is_err() + { + closed.push(receiver_id); + } + } + for id in closed { + dataflow.subscribe_channels.remove(id); + } + + Ok(()) + } + } + } } #[derive(Default)] @@ -252,6 +340,7 @@ pub struct RunningDataflow { subscribe_channels: HashMap>, node_tasks: HashMap>>, mappings: HashMap>, + timers: BTreeMap>, } type OutputId = (NodeId, DataId); @@ -267,6 +356,7 @@ pub enum Event { reply_sender: oneshot::Sender, }, Coordinator(DaemonCoordinatorEvent), + Dora(DoraEvent), } #[derive(Debug)] @@ -285,6 +375,14 @@ pub enum DaemonNodeEvent { }, } +pub enum DoraEvent { + Timer { + dataflow_id: DataflowId, + interval: Duration, + metadata: dora_message::Metadata<'static>, + }, +} + type MessageId = String; fn set_up_tracing() -> eyre::Result<()> { diff --git a/examples/rust-dataflow/sink/src/main.rs b/examples/rust-dataflow/sink/src/main.rs index ee12b7f0..59f631f0 100644 --- a/examples/rust-dataflow/sink/src/main.rs +++ b/examples/rust-dataflow/sink/src/main.rs @@ -1,5 +1,5 @@ use dora_node_api::{self, dora_core::daemon_messages::NodeEvent, DoraNode}; -use eyre::{bail, Context}; +use eyre::{bail, Context, ContextCompat}; fn main() -> eyre::Result<()> { let (_node, events) = DoraNode::init_from_env()?; @@ -13,7 +13,7 @@ fn main() -> eyre::Result<()> { data, } => match id.as_str() { "message" => { - let data = data.map()?; + let data = data.wrap_err("no data")?.map()?; let received_string = std::str::from_utf8(&data) 
.wrap_err("received message was not utf8-encoded")?; println!("received message: {}", received_string); diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 7a1eaf4a..a0494535 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -52,7 +52,7 @@ pub enum NodeEvent { Input { id: DataId, metadata: Metadata<'static>, - data: InputData, + data: Option, }, } From 3cd1b4de65ddc143a79a533d7ed7ad497f98fda3 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 13:33:13 +0100 Subject: [PATCH 023/225] Notify downstream nodes about closed inputs --- binaries/daemon/src/main.rs | 27 ++++++++++++++++++++++++- examples/rust-dataflow/node/src/main.rs | 1 + examples/rust-dataflow/sink/src/main.rs | 5 +++++ libraries/core/src/daemon_messages.rs | 4 ++++ 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 1352718a..4286fd36 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -283,9 +283,34 @@ impl Daemon { let _ = reply_sender.send(ControlReply::Result(Ok(()))); } DaemonNodeEvent::Stopped => { - // TODO send stop message to downstream nodes + tracing::info!("Stopped: {dataflow_id}/{node_id}"); let _ = reply_sender.send(ControlReply::Result(Ok(()))); + + // notify downstream nodes + let dataflow = self + .running + .get_mut(&dataflow_id) + .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; + let downstream_nodes: BTreeSet<_> = dataflow + .mappings + .iter() + .filter(|((source_id, _), _)| source_id == &node_id) + .flat_map(|(_, v)| v) + .collect(); + for (receiver_id, input_id) in downstream_nodes { + let Some(channel) = dataflow.subscribe_channels.get(receiver_id) else { + continue; + }; + + let _ = channel + .send_async(daemon_messages::NodeEvent::InputClosed { + id: input_id.clone(), + }) + .await; + } + + // TODO: notify remote nodes } } Ok(()) diff --git 
a/examples/rust-dataflow/node/src/main.rs b/examples/rust-dataflow/node/src/main.rs index ce7408df..442932e7 100644 --- a/examples/rust-dataflow/node/src/main.rs +++ b/examples/rust-dataflow/node/src/main.rs @@ -31,6 +31,7 @@ fn main() -> eyre::Result<()> { } other => eprintln!("Ignoring unexpected input `{other}`"), }, + other => eprintln!("Received unexpected input: {other:?}"), } } diff --git a/examples/rust-dataflow/sink/src/main.rs b/examples/rust-dataflow/sink/src/main.rs index 59f631f0..57e1f026 100644 --- a/examples/rust-dataflow/sink/src/main.rs +++ b/examples/rust-dataflow/sink/src/main.rs @@ -26,6 +26,11 @@ fn main() -> eyre::Result<()> { } other => eprintln!("Ignoring unexpected input `{other}`"), }, + NodeEvent::InputClosed { id } => { + println!("Input `{id}` was closed -> exiting"); + break; + } + other => eprintln!("Received unexpected input: {other:?}"), } } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index a0494535..e0021b88 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -47,6 +47,7 @@ pub enum ControlReply { } #[derive(Debug, serde::Serialize, serde::Deserialize)] +#[non_exhaustive] pub enum NodeEvent { Stop, Input { @@ -54,6 +55,9 @@ pub enum NodeEvent { metadata: Metadata<'static>, data: Option, }, + InputClosed { + id: DataId, + }, } #[derive(Debug, serde::Serialize, serde::Deserialize)] From f0e31e2cc6df6fe177ece15a282a81b465eb5b47 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 13:54:48 +0100 Subject: [PATCH 024/225] Report results of spawned nodes as events --- binaries/daemon/src/main.rs | 45 ++++++++++++++++++++++++++++++++---- binaries/daemon/src/spawn.rs | 21 +++++++++++++---- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 4286fd36..46a464d2 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -160,10 +160,9 @@ 
impl Daemon { } } - let task = spawn::spawn_node(dataflow_id, params, self.port) + spawn::spawn_node(dataflow_id, params, self.port, self.dora_events_tx.clone()) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; - dataflow.node_tasks.insert(node_id, task); } // spawn timer tasks @@ -311,6 +310,12 @@ impl Daemon { } // TODO: notify remote nodes + + dataflow.subscribe_channels.remove(&node_id); + if dataflow.subscribe_channels.is_empty() { + tracing::info!("Dataflow `{dataflow_id}` finished"); + self.running.remove(&dataflow_id); + } } } Ok(()) @@ -353,17 +358,42 @@ impl Daemon { for id in closed { dataflow.subscribe_channels.remove(id); } - - Ok(()) + } + DoraEvent::SpawnedNodeResult { + dataflow_id, + node_id, + result, + } => { + if self + .running + .get(&dataflow_id) + .and_then(|d| d.subscribe_channels.get(&node_id)) + .is_some() + { + tracing::warn!( + "node `{dataflow_id}/{node_id}` finished without sending `Stopped` message" + ); + } + match result { + Ok(()) => { + tracing::info!("node {dataflow_id}/{node_id} finished"); + } + Err(err) => { + tracing::error!( + "{:?}", + err.wrap_err(format!("error in node `{dataflow_id}/{node_id}`")) + ); + } + } } } + Ok(()) } } #[derive(Default)] pub struct RunningDataflow { subscribe_channels: HashMap>, - node_tasks: HashMap>>, mappings: HashMap>, timers: BTreeMap>, } @@ -406,6 +436,11 @@ pub enum DoraEvent { interval: Duration, metadata: dora_message::Metadata<'static>, }, + SpawnedNodeResult { + dataflow_id: DataflowId, + node_id: NodeId, + result: eyre::Result<()>, + }, } type MessageId = String; diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 5fcc8237..3fd98429 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,3 +1,4 @@ +use crate::DoraEvent; use dora_core::{ daemon_messages::{DataflowId, NodeConfig, SpawnNodeParams}, descriptor::{resolve_path, source_is_url}, @@ -5,13 +6,15 @@ use dora_core::{ use dora_download::download_file; use 
eyre::{eyre, WrapErr}; use std::{env::consts::EXE_EXTENSION, path::Path}; +use tokio::sync::mpsc; #[tracing::instrument] pub async fn spawn_node( dataflow_id: DataflowId, params: SpawnNodeParams, daemon_port: u16, -) -> eyre::Result>> { + result_tx: mpsc::Sender, +) -> eyre::Result<()> { let SpawnNodeParams { node_id, node, @@ -63,16 +66,26 @@ pub async fn spawn_node( node.args.as_deref().unwrap_or_default() ) })?; - let result = tokio::spawn(async move { + let node_id_cloned = node_id.clone(); + let wait_task = async move { let status = child.wait().await.context("child process failed")?; if status.success() { - tracing::info!("node {node_id} finished"); Ok(()) } else if let Some(code) = status.code() { Err(eyre!("node {node_id} failed with exit code: {code}")) } else { Err(eyre!("node {node_id} failed (unknown exit code)")) } + }; + tokio::spawn(async move { + let result = wait_task.await; + let _ = result_tx + .send(DoraEvent::SpawnedNodeResult { + dataflow_id, + node_id: node_id_cloned, + result, + }) + .await; }); - Ok(result) + Ok(()) } From 3c4469d4d3e0472d05871ed2d1f5f37c9254680d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 14:04:05 +0100 Subject: [PATCH 025/225] Cancel timer tasks once a dataflow is finished --- Cargo.lock | 1 + binaries/daemon/Cargo.toml | 1 + binaries/daemon/src/main.rs | 5 +++++ 3 files changed, 7 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 3d187199..a806a614 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -975,6 +975,7 @@ dependencies = [ "dora-message", "eyre", "flume", + "futures", "futures-concurrency 7.0.0", "serde", "serde_json", diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 55f6a27d..2b33f259 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -21,3 +21,4 @@ flume = "0.10.14" dora-download = { path = "../../libraries/extensions/download" } serde_yaml = "0.8.23" uuid = { version = "1.1.2", features = ["v4"] } +futures = "0.3.25" diff --git 
a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 46a464d2..8e54867e 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -5,6 +5,7 @@ use dora_core::{ }; use dora_message::uhlc::HLC; use eyre::{bail, eyre, Context, ContextCompat}; +use futures::FutureExt; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; use std::{ @@ -187,7 +188,9 @@ impl Daemon { } } }; + let (task, handle) = task.remote_handle(); tokio::spawn(task); + dataflow._timer_handles.push(handle); } Ok(()) @@ -396,6 +399,8 @@ pub struct RunningDataflow { subscribe_channels: HashMap>, mappings: HashMap>, timers: BTreeMap>, + /// Keep handles to all timer tasks of this dataflow to cancel them on drop. + _timer_handles: Vec>, } type OutputId = (NodeId, DataId); From 692595dd6f2cf5be283c5d210673d79a601b6bd5 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 17:41:14 +0100 Subject: [PATCH 026/225] Allow replacing a daemon connection in coordinator The previous connection might be closed already. It's difficult to check for that without sending any actual data, so we just permit re-registering of daemon connections. Previous connections are closed. 
--- binaries/coordinator/src/lib.rs | 45 +++++++++++++++++---------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index dc12ef20..0bbdf3d3 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -13,8 +13,10 @@ use futures::StreamExt; use futures_concurrency::stream::Merge; use run::{await_tasks, SpawnedDataflow}; use std::{ - collections::HashMap, + collections::{hash_map, HashMap}, + io::ErrorKind, path::{Path, PathBuf}, + time::Duration, }; use tokio::net::TcpStream; use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream}; @@ -96,7 +98,7 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { .merge(); let mut running_dataflows = HashMap::new(); - let mut daemon_connections = HashMap::new(); + let mut daemon_connections: HashMap<_, TcpStream> = HashMap::new(); while let Some(event) = events.next().await { tracing::trace!("Handling event {event:?}"); @@ -108,29 +110,28 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { Event::DaemonConnectError(err) => { tracing::warn!("{:?}", err.wrap_err("failed to connect to dora-daemon")); } - Event::Daemon(event) => match event { - DaemonEvent::Register { - machine_id, - mut connection, - } => match daemon_connections.entry(machine_id) { - std::collections::hash_map::Entry::Vacant(entry) => { + Event::Daemon(event) => { + match event { + DaemonEvent::Register { + machine_id, + mut connection, + } => { let reply = RegisterResult::Ok; - if tcp_send(&mut connection, &serde_json::to_vec(&reply)?) 
- .await - .is_ok() - { - entry.insert(connection); + match tcp_send(&mut connection, &serde_json::to_vec(&reply)?).await { + Ok(()) => { + let previous = + daemon_connections.insert(machine_id.clone(), connection); + if let Some(_previous) = previous { + tracing::info!("closing previous connection `{machine_id}` on new register"); + } + } + Err(err) => { + tracing::warn!("failed to register daemon connection for machine `{machine_id}`: {err}"); + } } } - std::collections::hash_map::Entry::Occupied(entry) => { - let reply = RegisterResult::Err(format!( - "there is already a daemon connection for machine `{}`", - entry.key() - )); - let _ = tcp_send(&mut connection, &serde_json::to_vec(&reply)?).await; - } - }, - }, + } + } Event::Dataflow { uuid, event } => match event { DataflowEvent::Finished { result } => { running_dataflows.remove(&uuid); From 8bfb665201791c7b371424640e36a803075e2ff6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 13 Dec 2022 17:59:54 +0100 Subject: [PATCH 027/225] Set `TCP_NODELAY` for all TCP connections to reduce latency --- apis/rust/node/src/daemon.rs | 6 ++++++ binaries/daemon/src/coordinator.rs | 3 +++ 2 files changed, 9 insertions(+) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 975a1d3c..85d25825 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -105,6 +105,9 @@ fn init_event_stream( ) -> eyre::Result { let mut event_stream = TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; + event_stream + .set_nodelay(true) + .wrap_err("failed to set TCP_NODELAY")?; tcp_send( &mut event_stream, &ControlRequest::Subscribe { @@ -152,6 +155,9 @@ fn init_control_stream( ) -> eyre::Result { let mut control_stream = TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; + control_stream + .set_nodelay(true) + .wrap_err("failed to set TCP_NODELAY")?; tcp_send( &mut control_stream, &ControlRequest::Register { diff --git 
a/binaries/daemon/src/coordinator.rs b/binaries/daemon/src/coordinator.rs index 457b79c0..b846c697 100644 --- a/binaries/daemon/src/coordinator.rs +++ b/binaries/daemon/src/coordinator.rs @@ -12,6 +12,9 @@ pub async fn connect(addr: SocketAddr) -> eyre::Result Date: Wed, 14 Dec 2022 11:36:58 +0100 Subject: [PATCH 028/225] Make events a borrowed type and report to daemon when they're dropped --- apis/rust/node/src/daemon.rs | 114 ++++++++++++++++++++++++-- libraries/core/src/daemon_messages.rs | 25 +----- 2 files changed, 110 insertions(+), 29 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 85d25825..ce92abe1 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -1,15 +1,17 @@ use std::{ io::{ErrorKind, Read, Write}, + marker::PhantomData, net::{Ipv4Addr, SocketAddr, TcpStream}, + time::Duration, }; use dora_core::{ config::{DataId, NodeId}, daemon_messages::{ControlRequest, DataflowId, NodeEvent}, }; +use dora_message::Metadata; use eyre::{bail, eyre, Context}; - -pub type EventStream = flume::Receiver; +use shared_memory::{Shmem, ShmemConf}; pub struct DaemonConnection { pub control_channel: ControlChannel, @@ -94,6 +96,38 @@ impl ControlChannel { } } +pub struct EventStream { + receiver: flume::Receiver<(NodeEvent, std::sync::mpsc::Sender<()>)>, +} + +impl EventStream { + pub fn recv(&mut self) -> Option { + let (node_event, ack) = match self.receiver.recv() { + Ok(d) => d, + Err(flume::RecvError::Disconnected) => return None, + }; + let event = match node_event { + NodeEvent::Stop => Event::Stop, + NodeEvent::InputClosed { id } => Event::InputClosed { id }, + NodeEvent::Input { id, metadata, data } => { + let mapped = data + .map(|d| unsafe { MappedInputData::map(&d.shared_memory_id) }) + .transpose(); + match mapped { + Ok(mapped) => Event::Input { + id, + metadata, + data: mapped.map(|data| Data { data, _ack: ack }), + }, + Err(err) => Event::Error(format!("{err:?}")), + } + } + }; + + 
Some(event) + } +} + pub struct MessageSample { pub id: String, } @@ -127,7 +161,7 @@ fn init_event_stream( let (tx, rx) = flume::bounded(1); std::thread::spawn(move || loop { - let event = match tcp_receive(&mut event_stream) { + let event: NodeEvent = match tcp_receive(&mut event_stream) { Ok(event) => event, Err(err) if err.kind() == ErrorKind::UnexpectedEof => break, Err(err) => { @@ -136,16 +170,86 @@ fn init_event_stream( continue; } }; - match tx.send(event) { + + let (ack_tx, ack_rx) = std::sync::mpsc::channel(); + match tx.send((event, ack_tx)) { Ok(()) => {} Err(_) => { // receiving end of channel was closed break; } } + + match ack_rx.recv_timeout(Duration::from_secs(30)) { + Ok(()) => panic!("Node API should not send anything on ACK channel"), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + tracing::warn!("timeout while waiting for input ACK"); + } + Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => {} // expected result + } }); - Ok(rx) + Ok(EventStream { receiver: rx }) +} + +#[derive(Debug)] +#[non_exhaustive] +pub enum Event<'a> { + Stop, + Input { + id: DataId, + metadata: Metadata<'static>, + data: Option>, + }, + InputClosed { + id: DataId, + }, + Error(String), +} + +pub struct Data<'a> { + data: MappedInputData<'a>, + _ack: std::sync::mpsc::Sender<()>, +} + +impl std::ops::Deref for Data<'_> { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + &self.data + } +} + +impl std::fmt::Debug for Data<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Data").finish_non_exhaustive() + } +} + +pub struct MappedInputData<'a> { + memory: Shmem, + _data: PhantomData<&'a [u8]>, +} + +impl MappedInputData<'_> { + unsafe fn map(shared_memory_id: &str) -> eyre::Result { + let memory = ShmemConf::new() + .os_id(shared_memory_id) + .open() + .wrap_err("failed to map shared memory input")?; + Ok(MappedInputData { + memory, + _data: PhantomData, + }) + } +} + +impl std::ops::Deref for 
MappedInputData<'_> { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + unsafe { self.memory.as_slice() } + } } fn init_control_stream( diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index e0021b88..7129ea9b 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -5,8 +5,6 @@ use crate::{ descriptor, }; use dora_message::Metadata; -use eyre::Context; -use shared_memory::{Shmem, ShmemConf}; use uuid::Uuid; #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -47,7 +45,6 @@ pub enum ControlReply { } #[derive(Debug, serde::Serialize, serde::Deserialize)] -#[non_exhaustive] pub enum NodeEvent { Stop, Input { @@ -62,33 +59,13 @@ pub enum NodeEvent { #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct InputData { - shared_memory_id: SharedMemoryId, + pub shared_memory_id: SharedMemoryId, } impl InputData { pub unsafe fn new(shared_memory_id: SharedMemoryId) -> Self { Self { shared_memory_id } } - - pub fn map(self) -> eyre::Result { - let memory = ShmemConf::new() - .os_id(self.shared_memory_id) - .open() - .wrap_err("failed to map shared memory input")?; - Ok(MappedInputData { memory }) - } -} - -pub struct MappedInputData { - memory: Shmem, -} - -impl std::ops::Deref for MappedInputData { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - unsafe { self.memory.as_slice() } - } } #[derive(Debug, serde::Deserialize, serde::Serialize)] From a2cc06ca504cb3eae177651c636cd4303604d1a9 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 14 Dec 2022 12:24:47 +0100 Subject: [PATCH 029/225] Use drop tokens and reference counting to free shared memory again after usage --- apis/rust/node/src/daemon.rs | 211 ++++++++++++++------------ binaries/daemon/src/listener.rs | 30 +++- binaries/daemon/src/main.rs | 35 ++++- binaries/daemon/src/tcp_utils.rs | 12 +- libraries/core/src/daemon_messages.rs | 21 ++- 5 files changed, 192 insertions(+), 117 
deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index ce92abe1..68bc0377 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -7,7 +7,7 @@ use std::{ use dora_core::{ config::{DataId, NodeId}, - daemon_messages::{ControlRequest, DataflowId, NodeEvent}, + daemon_messages::{ControlRequest, DataflowId, DropEvent, NodeEvent}, }; use dora_message::Metadata; use eyre::{bail, eyre, Context}; @@ -21,14 +21,14 @@ pub struct DaemonConnection { impl DaemonConnection { pub fn init(dataflow_id: DataflowId, node_id: &NodeId, daemon_port: u16) -> eyre::Result { let daemon_addr = (Ipv4Addr::new(127, 0, 0, 1), daemon_port).into(); - let control_stream = init_control_stream(daemon_addr, dataflow_id, &node_id) + let control_channel = ControlChannel::init(daemon_addr, dataflow_id, node_id) .wrap_err("failed to init control stream")?; - let event_stream = init_event_stream(daemon_addr, dataflow_id, &node_id) + let event_stream = EventStream::init(daemon_addr, dataflow_id, node_id) .wrap_err("failed to init event stream")?; Ok(Self { - control_channel: ControlChannel(control_stream), + control_channel, event_stream, }) } @@ -37,6 +37,35 @@ impl DaemonConnection { pub struct ControlChannel(TcpStream); impl ControlChannel { + fn init( + daemon_addr: SocketAddr, + dataflow_id: DataflowId, + node_id: &NodeId, + ) -> eyre::Result { + let mut control_stream = + TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; + control_stream + .set_nodelay(true) + .wrap_err("failed to set TCP_NODELAY")?; + tcp_send( + &mut control_stream, + &ControlRequest::Register { + dataflow_id, + node_id: node_id.clone(), + }, + ) + .wrap_err("failed to send register request to dora-daemon")?; + match tcp_receive(&mut control_stream) + .wrap_err("failed to receive register reply from dora-daemon")? 
+ { + dora_core::daemon_messages::ControlReply::Result(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to register node with dora-daemon")?, + other => bail!("unexpected register reply: {other:?}"), + } + Ok(Self(control_stream)) + } + pub fn report_stop(&mut self) -> eyre::Result<()> { tcp_send(&mut self.0, &ControlRequest::Stopped) .wrap_err("failed to send subscribe request to dora-daemon")?; @@ -101,8 +130,84 @@ pub struct EventStream { } impl EventStream { + fn init( + daemon_addr: SocketAddr, + dataflow_id: DataflowId, + node_id: &NodeId, + ) -> eyre::Result { + let mut event_stream = + TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; + event_stream + .set_nodelay(true) + .wrap_err("failed to set TCP_NODELAY")?; + tcp_send( + &mut event_stream, + &ControlRequest::Subscribe { + dataflow_id, + node_id: node_id.clone(), + }, + ) + .wrap_err("failed to send subscribe request to dora-daemon")?; + match tcp_receive(&mut event_stream) + .wrap_err("failed to receive subscribe reply from dora-daemon")? + { + dora_core::daemon_messages::ControlReply::Result(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to create subscription with dora-daemon")?, + other => bail!("unexpected subscribe reply: {other:?}"), + } + + let (tx, rx) = flume::bounded(1); + std::thread::spawn(move || loop { + let event: NodeEvent = match tcp_receive(&mut event_stream) { + Ok(event) => event, + Err(err) if err.kind() == ErrorKind::UnexpectedEof => break, + Err(err) => { + let err = eyre!(err).wrap_err("failed to receive incoming event"); + tracing::warn!("{err:?}"); + continue; + } + }; + let drop_token = match &event { + NodeEvent::Input { + data: Some(data), .. + } => Some(data.drop_token.clone()), + NodeEvent::Stop + | NodeEvent::InputClosed { .. } + | NodeEvent::Input { data: None, .. 
} => None, + }; + + let (drop_tx, drop_rx) = std::sync::mpsc::channel(); + match tx.send((event, drop_tx)) { + Ok(()) => {} + Err(_) => { + // receiving end of channel was closed + break; + } + } + + match drop_rx.recv_timeout(Duration::from_secs(30)) { + Ok(()) => panic!("Node API should not send anything on ACK channel"), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + tracing::warn!("timeout while waiting for input ACK"); + } + Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => {} // expected result + } + + if let Some(token) = drop_token { + let message = DropEvent { token }; + if let Err(err) = tcp_send(&mut event_stream, &message) { + tracing::warn!("failed to send drop token: {err}"); + break; + } + } + }); + + Ok(EventStream { receiver: rx }) + } + pub fn recv(&mut self) -> Option { - let (node_event, ack) = match self.receiver.recv() { + let (node_event, drop_sender) = match self.receiver.recv() { Ok(d) => d, Err(flume::RecvError::Disconnected) => return None, }; @@ -117,7 +222,10 @@ impl EventStream { Ok(mapped) => Event::Input { id, metadata, - data: mapped.map(|data| Data { data, _ack: ack }), + data: mapped.map(|data| Data { + data, + _drop: drop_sender, + }), }, Err(err) => Event::Error(format!("{err:?}")), } @@ -132,66 +240,6 @@ pub struct MessageSample { pub id: String, } -fn init_event_stream( - daemon_addr: SocketAddr, - dataflow_id: DataflowId, - node_id: &NodeId, -) -> eyre::Result { - let mut event_stream = - TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; - event_stream - .set_nodelay(true) - .wrap_err("failed to set TCP_NODELAY")?; - tcp_send( - &mut event_stream, - &ControlRequest::Subscribe { - dataflow_id, - node_id: node_id.clone(), - }, - ) - .wrap_err("failed to send subscribe request to dora-daemon")?; - match tcp_receive(&mut event_stream) - .wrap_err("failed to receive subscribe reply from dora-daemon")? 
- { - dora_core::daemon_messages::ControlReply::Result(result) => result - .map_err(|e| eyre!(e)) - .wrap_err("failed to create subscription with dora-daemon")?, - other => bail!("unexpected subscribe reply: {other:?}"), - } - - let (tx, rx) = flume::bounded(1); - std::thread::spawn(move || loop { - let event: NodeEvent = match tcp_receive(&mut event_stream) { - Ok(event) => event, - Err(err) if err.kind() == ErrorKind::UnexpectedEof => break, - Err(err) => { - let err = eyre!(err).wrap_err("failed to receive incoming event"); - tracing::warn!("{err:?}"); - continue; - } - }; - - let (ack_tx, ack_rx) = std::sync::mpsc::channel(); - match tx.send((event, ack_tx)) { - Ok(()) => {} - Err(_) => { - // receiving end of channel was closed - break; - } - } - - match ack_rx.recv_timeout(Duration::from_secs(30)) { - Ok(()) => panic!("Node API should not send anything on ACK channel"), - Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { - tracing::warn!("timeout while waiting for input ACK"); - } - Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => {} // expected result - } - }); - - Ok(EventStream { receiver: rx }) -} - #[derive(Debug)] #[non_exhaustive] pub enum Event<'a> { @@ -209,7 +257,7 @@ pub enum Event<'a> { pub struct Data<'a> { data: MappedInputData<'a>, - _ack: std::sync::mpsc::Sender<()>, + _drop: std::sync::mpsc::Sender<()>, } impl std::ops::Deref for Data<'_> { @@ -252,35 +300,6 @@ impl std::ops::Deref for MappedInputData<'_> { } } -fn init_control_stream( - daemon_addr: SocketAddr, - dataflow_id: DataflowId, - node_id: &NodeId, -) -> eyre::Result { - let mut control_stream = - TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; - control_stream - .set_nodelay(true) - .wrap_err("failed to set TCP_NODELAY")?; - tcp_send( - &mut control_stream, - &ControlRequest::Register { - dataflow_id, - node_id: node_id.clone(), - }, - ) - .wrap_err("failed to send register request to dora-daemon")?; - match tcp_receive(&mut control_stream) 
- .wrap_err("failed to receive register reply from dora-daemon")? - { - dora_core::daemon_messages::ControlReply::Result(result) => result - .map_err(|e| eyre!(e)) - .wrap_err("failed to register node with dora-daemon")?, - other => bail!("unexpected register reply: {other:?}"), - } - Ok(control_stream) -} - fn tcp_send(connection: &mut TcpStream, request: &T) -> std::io::Result<()> { let serialized = serde_json::to_vec(request)?; diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index f3b0d3dc..ee75a4c5 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -2,7 +2,7 @@ use crate::{ tcp_utils::{tcp_receive, tcp_send}, DaemonNodeEvent, Event, }; -use dora_core::daemon_messages; +use dora_core::daemon_messages::{self, DropEvent}; use eyre::{eyre, Context}; use std::{io::ErrorKind, net::Ipv4Addr}; use tokio::{ @@ -136,16 +136,38 @@ pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sende // enter subscribe loop after receiving a subscribe message if let Some(events) = enter_subscribe_loop { - subscribe_loop(connection, events).await; + subscribe_loop(connection, events, events_tx).await; break; // the subscribe loop only exits when the connection was closed } } } async fn subscribe_loop( - mut connection: TcpStream, + connection: TcpStream, events: flume::Receiver, + events_tx: mpsc::Sender, ) { + let (mut rx, mut tx) = connection.into_split(); + + tokio::spawn(async move { + loop { + let Ok(raw) = tcp_receive(&mut rx).await else { + break; + }; + + let event: DropEvent = match serde_json::from_slice(&raw) { + Ok(e) => e, + Err(err) => { + tracing::error!("Failed to parse incoming message: {err}"); + continue; + } + }; + if events_tx.send(Event::Drop(event)).await.is_err() { + break; + } + } + }); + while let Some(event) = events.stream().next().await { let message = match serde_json::to_vec(&event) { Ok(m) => m, @@ -155,7 +177,7 @@ async fn subscribe_loop( continue; } }; - match 
tcp_send(&mut connection, &message).await { + match tcp_send(&mut tx, &message).await { Ok(()) => {} Err(err) if err.kind() == ErrorKind::UnexpectedEof diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 8e54867e..faedde5b 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,6 +1,9 @@ use dora_core::{ config::{DataId, InputMapping, NodeId}, - daemon_messages::{self, ControlReply, DaemonCoordinatorEvent, DataflowId, SpawnDataflowNodes}, + daemon_messages::{ + self, ControlReply, DaemonCoordinatorEvent, DataflowId, DropEvent, DropToken, + SpawnDataflowNodes, + }, topics::DORA_COORDINATOR_PORT_DEFAULT, }; use dora_message::uhlc::HLC; @@ -11,6 +14,7 @@ use shared_memory::{Shmem, ShmemConf}; use std::{ collections::{BTreeMap, BTreeSet, HashMap}, net::{Ipv4Addr, SocketAddr}, + rc::Rc, time::Duration, }; use tokio::{ @@ -47,7 +51,7 @@ async fn run() -> eyre::Result<()> { struct Daemon { port: u16, uninit_shared_memory: HashMap, Shmem)>, - sent_out_shared_memory: HashMap, + sent_out_shared_memory: HashMap>, running: HashMap, @@ -117,6 +121,18 @@ impl Daemon { .await? 
} Event::Dora(event) => self.handle_dora_event(event).await?, + Event::Drop(DropEvent { token }) => { + match self.sent_out_shared_memory.remove(&token) { + Some(rc) => { + if let Ok(_shmem) = Rc::try_unwrap(rc) { + tracing::trace!( + "freeing shared memory after receiving last drop token" + ) + } + } + None => tracing::warn!("received unknown drop token {token:?}"), + } + } } } @@ -243,6 +259,8 @@ impl Daemon { .remove(&id) .ok_or_else(|| eyre!("invalid shared memory id"))?; + let memory = Rc::new(memory); + let dataflow = self .running .get_mut(&dataflow_id) @@ -259,17 +277,24 @@ impl Daemon { let mut closed = Vec::new(); for (receiver_id, input_id) in local_receivers { if let Some(channel) = dataflow.subscribe_channels.get(receiver_id) { + let drop_token = DropToken::generate(); if channel .send_async(daemon_messages::NodeEvent::Input { id: input_id.clone(), metadata: metadata.clone(), - data: Some(unsafe { daemon_messages::InputData::new(id.clone()) }), + data: Some(daemon_messages::InputData { + shared_memory_id: id.clone(), + drop_token: drop_token.clone(), + }), }) .await .is_err() { closed.push(receiver_id); } + // keep shared memory ptr in order to free it once all subscribers are done + self.sent_out_shared_memory + .insert(drop_token, memory.clone()); } } for id in closed { @@ -279,9 +304,6 @@ impl Daemon { // TODO send `data` via network to all remove receivers let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); - // keep shared memory ptr in order to free it once all subscribers are done - self.sent_out_shared_memory.insert(id, memory); - let _ = reply_sender.send(ControlReply::Result(Ok(()))); } DaemonNodeEvent::Stopped => { @@ -417,6 +439,7 @@ pub enum Event { }, Coordinator(DaemonCoordinatorEvent), Dora(DoraEvent), + Drop(DropEvent), } #[derive(Debug)] diff --git a/binaries/daemon/src/tcp_utils.rs b/binaries/daemon/src/tcp_utils.rs index 31f5e3b5..b6c31e30 100644 --- a/binaries/daemon/src/tcp_utils.rs +++ 
b/binaries/daemon/src/tcp_utils.rs @@ -1,16 +1,16 @@ -use tokio::{ - io::{AsyncReadExt, AsyncWriteExt}, - net::TcpStream, -}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; -pub async fn tcp_send(connection: &mut TcpStream, message: &[u8]) -> std::io::Result<()> { +pub async fn tcp_send( + connection: &mut (impl AsyncWrite + Unpin), + message: &[u8], +) -> std::io::Result<()> { let len_raw = (message.len() as u64).to_le_bytes(); connection.write_all(&len_raw).await?; connection.write_all(message).await?; Ok(()) } -pub async fn tcp_receive(connection: &mut TcpStream) -> std::io::Result> { +pub async fn tcp_receive(connection: &mut (impl AsyncRead + Unpin)) -> std::io::Result> { let reply_len = { let mut raw = [0; 8]; connection.read_exact(&mut raw).await?; diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 7129ea9b..54e0b3fd 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -58,16 +58,27 @@ pub enum NodeEvent { } #[derive(Debug, serde::Serialize, serde::Deserialize)] -pub struct InputData { - pub shared_memory_id: SharedMemoryId, +pub struct DropEvent { + pub token: DropToken, } -impl InputData { - pub unsafe fn new(shared_memory_id: SharedMemoryId) -> Self { - Self { shared_memory_id } +#[derive( + Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, serde::Serialize, serde::Deserialize, +)] +pub struct DropToken(Uuid); + +impl DropToken { + pub fn generate() -> Self { + Self(Uuid::new_v4()) } } +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct InputData { + pub shared_memory_id: SharedMemoryId, + pub drop_token: DropToken, +} + #[derive(Debug, serde::Deserialize, serde::Serialize)] pub enum DaemonCoordinatorEvent { Spawn(SpawnDataflowNodes), From 424561e74b08b74f17c654798c129b8ed47d649b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 14 Dec 2022 15:27:50 +0100 Subject: [PATCH 030/225] Update rust-dataflow example to work 
with latest daemon code --- examples/rust-dataflow/dataflow.yml | 25 +++++++++++----------- examples/rust-dataflow/node/src/main.rs | 21 +++++++++---------- examples/rust-dataflow/sink/src/main.rs | 28 ++++++++++--------------- 3 files changed, 34 insertions(+), 40 deletions(-) diff --git a/examples/rust-dataflow/dataflow.yml b/examples/rust-dataflow/dataflow.yml index 78b8a6cb..eb5e5995 100644 --- a/examples/rust-dataflow/dataflow.yml +++ b/examples/rust-dataflow/dataflow.yml @@ -8,22 +8,23 @@ nodes: build: cargo build -p rust-dataflow-example-node source: ../../target/debug/rust-dataflow-example-node inputs: - tick: dora/timer/millis/300 + tick: dora/timer/millis/3 outputs: - random - - id: runtime-node - operators: - - id: rust-operator - build: cargo build -p rust-dataflow-example-operator - shared-library: ../../target/debug/rust_dataflow_example_operator - inputs: - tick: dora/timer/millis/100 - random: rust-node/random - outputs: - - status + # - id: runtime-node + # operators: + # - id: rust-operator + # build: cargo build -p rust-dataflow-example-operator + # shared-library: ../../target/debug/rust_dataflow_example_operator + # inputs: + # tick: dora/timer/millis/100 + # random: rust-node/random + # outputs: + # - status - id: rust-sink custom: build: cargo build -p rust-dataflow-example-sink source: ../../target/debug/rust-dataflow-example-sink inputs: - message: runtime-node/rust-operator/status + # message: runtime-node/rust-operator/status + message: rust-node/random diff --git a/examples/rust-dataflow/node/src/main.rs b/examples/rust-dataflow/node/src/main.rs index 442932e7..17f635b0 100644 --- a/examples/rust-dataflow/node/src/main.rs +++ b/examples/rust-dataflow/node/src/main.rs @@ -1,29 +1,28 @@ -use dora_node_api::{ - self, - dora_core::{config::DataId, daemon_messages::NodeEvent}, - DoraNode, -}; +use dora_node_api::{self, daemon::Event, dora_core::config::DataId, DoraNode}; fn main() -> eyre::Result<()> { + println!("hello"); + let output = 
DataId::from("random".to_owned()); - let (mut node, events) = DoraNode::init_from_env()?; + let (mut node, mut events) = DoraNode::init_from_env()?; - for _ in 0..20 { + for i in 0..100 { let event = match events.recv() { - Ok(input) => input, - Err(_) => break, + Some(input) => input, + None => break, }; match event { - NodeEvent::Stop => break, - NodeEvent::Input { + Event::Stop => break, + Event::Input { id, metadata, data: _, } => match id.as_str() { "tick" => { let random: u64 = rand::random(); + println!("tick {i}, sending {random:#x}"); let data: &[u8] = &random.to_le_bytes(); node.send_output(output.clone(), metadata.parameters, data.len(), |out| { out.copy_from_slice(data); diff --git a/examples/rust-dataflow/sink/src/main.rs b/examples/rust-dataflow/sink/src/main.rs index 57e1f026..20a0931f 100644 --- a/examples/rust-dataflow/sink/src/main.rs +++ b/examples/rust-dataflow/sink/src/main.rs @@ -1,32 +1,26 @@ -use dora_node_api::{self, dora_core::daemon_messages::NodeEvent, DoraNode}; -use eyre::{bail, Context, ContextCompat}; +use dora_node_api::{self, daemon::Event, DoraNode}; +use eyre::ContextCompat; fn main() -> eyre::Result<()> { - let (_node, events) = DoraNode::init_from_env()?; + let (_node, mut events) = DoraNode::init_from_env()?; - while let Ok(event) = events.recv() { + while let Some(event) = events.recv() { match event { - NodeEvent::Stop => break, - NodeEvent::Input { + Event::Stop => break, + Event::Input { id, metadata: _, data, } => match id.as_str() { "message" => { - let data = data.wrap_err("no data")?.map()?; - let received_string = std::str::from_utf8(&data) - .wrap_err("received message was not utf8-encoded")?; - println!("received message: {}", received_string); - if !received_string.starts_with("operator received random value ") { - bail!("unexpected message format (should start with 'operator received random value')") - } - if !received_string.ends_with(" ticks") { - bail!("unexpected message format (should end with 'ticks')") - } 
+ let data = data.wrap_err("no data")?; + let raw = (&data[..]).try_into().unwrap(); + + println!("received data: {:#x}", u64::from_le_bytes(raw)); } other => eprintln!("Ignoring unexpected input `{other}`"), }, - NodeEvent::InputClosed { id } => { + Event::InputClosed { id } => { println!("Input `{id}` was closed -> exiting"); break; } From 5982815968806f773af6a45e05dbdab8e5d048ed Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 14 Dec 2022 17:51:09 +0100 Subject: [PATCH 031/225] Don't block daemon when receivers cannot keep up This can easily cause deadlocks. --- binaries/daemon/src/main.rs | 67 +++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index faedde5b..0fbb64e0 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -20,6 +20,7 @@ use std::{ use tokio::{ net::TcpStream, sync::{mpsc, oneshot}, + time::timeout, }; use tokio_stream::{ wrappers::{ReceiverStream, TcpListenerStream}, @@ -278,23 +279,30 @@ impl Daemon { for (receiver_id, input_id) in local_receivers { if let Some(channel) = dataflow.subscribe_channels.get(receiver_id) { let drop_token = DropToken::generate(); - if channel - .send_async(daemon_messages::NodeEvent::Input { - id: input_id.clone(), - metadata: metadata.clone(), - data: Some(daemon_messages::InputData { - shared_memory_id: id.clone(), - drop_token: drop_token.clone(), - }), - }) - .await - .is_err() - { - closed.push(receiver_id); + let send_result = channel.send_async(daemon_messages::NodeEvent::Input { + id: input_id.clone(), + metadata: metadata.clone(), + data: Some(daemon_messages::InputData { + shared_memory_id: id.clone(), + drop_token: drop_token.clone(), + }), + }); + + match timeout(Duration::from_millis(10), send_result).await { + Ok(Ok(())) => { + // keep shared memory ptr in order to free it once all subscribers are done + self.sent_out_shared_memory + .insert(drop_token, 
memory.clone()); + } + Ok(Err(_)) => { + closed.push(receiver_id); + } + Err(_) => { + tracing::warn!( + "dropping input event `{receiver_id}/{input_id}` (send timeout)" + ); + } } - // keep shared memory ptr in order to free it once all subscribers are done - self.sent_out_shared_memory - .insert(drop_token, memory.clone()); } } for id in closed { @@ -368,16 +376,21 @@ impl Daemon { continue; }; - if channel - .send_async(daemon_messages::NodeEvent::Input { - id: input_id.clone(), - metadata: metadata.clone(), - data: None, - }) - .await - .is_err() - { - closed.push(receiver_id); + let send_result = channel.send_async(daemon_messages::NodeEvent::Input { + id: input_id.clone(), + metadata: metadata.clone(), + data: None, + }); + match timeout(Duration::from_millis(1), send_result).await { + Ok(Ok(())) => {} + Ok(Err(_)) => { + closed.push(receiver_id); + } + Err(_) => { + tracing::info!( + "dropping timer tick event for `{receiver_id}` (send timeout)" + ); + } } } for id in closed { @@ -428,6 +441,7 @@ pub struct RunningDataflow { type OutputId = (NodeId, DataId); type InputId = (NodeId, DataId); +#[derive(Debug)] pub enum Event { NewConnection(TcpStream), ConnectError(eyre::Report), @@ -458,6 +472,7 @@ pub enum DaemonNodeEvent { }, } +#[derive(Debug)] pub enum DoraEvent { Timer { dataflow_id: DataflowId, From 871a4bc3dc8191f29a74f56a47ec18ada4c580da Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 14 Dec 2022 17:52:12 +0100 Subject: [PATCH 032/225] Decrease timer frequency in rust dataflow example --- examples/rust-dataflow/dataflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/rust-dataflow/dataflow.yml b/examples/rust-dataflow/dataflow.yml index eb5e5995..4bf75ae9 100644 --- a/examples/rust-dataflow/dataflow.yml +++ b/examples/rust-dataflow/dataflow.yml @@ -8,7 +8,7 @@ nodes: build: cargo build -p rust-dataflow-example-node source: ../../target/debug/rust-dataflow-example-node inputs: - tick: dora/timer/millis/3 + 
tick: dora/timer/millis/10 outputs: - random # - id: runtime-node From ec797e6299918095e6739ab7cb08e81bcb6e2826 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 21 Dec 2022 13:08:45 +0100 Subject: [PATCH 033/225] Timers are handled in daemon now --- binaries/coordinator/src/run/mod.rs | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index e7ef3022..4dcff706 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -104,33 +104,6 @@ pub async fn spawn_dataflow( .await .wrap_err("failed to send spawn message to daemon")?; - // TODO - for interval in dora_timers { - let communication_config = communication_config.clone(); - // let mut communication = - // tokio::task::spawn_blocking(move || communication::init(&communication_config)) - // .await - // .wrap_err("failed to join communication layer init task")? - // .wrap_err("failed to init communication layer")?; - tokio::spawn(async move { - let topic = { - let duration = format_duration(interval); - format!("dora/timer/{duration}") - }; - let hlc = dora_message::uhlc::HLC::default(); - let mut stream = IntervalStream::new(tokio::time::interval(interval)); - while (stream.next().await).is_some() { - let metadata = dora_message::Metadata::new(hlc.new_timestamp()); - let data = metadata.serialize().unwrap(); - // communication - // .publisher(&topic) - // .unwrap() - // .publish(&data) - // .expect("failed to publish timer tick message"); - // todo!() - } - }); - } Ok(SpawnedDataflow { tasks: FuturesUnordered::new(), // TODO communication_config, From 2527ca0e856990f4c3d8a02486d039eadfdd3362 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 21 Dec 2022 13:42:03 +0100 Subject: [PATCH 034/225] Report spawn result from daemon to coordinator --- binaries/coordinator/src/run/mod.rs | 25 ++++++++++++----- binaries/daemon/src/coordinator.rs | 39 
++++++++++++++++++++++++--- binaries/daemon/src/main.rs | 14 +++++++--- libraries/core/src/daemon_messages.rs | 5 ++++ 4 files changed, 69 insertions(+), 14 deletions(-) diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index 4dcff706..c4c947a8 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -1,10 +1,11 @@ -use crate::tcp_utils::tcp_send; +use crate::tcp_utils::{tcp_receive, tcp_send}; -use self::runtime::spawn_runtime_node; use dora_core::{ - config::{format_duration, CommunicationConfig, NodeId}, - daemon_messages::{DaemonCoordinatorEvent, SpawnDataflowNodes, SpawnNodeParams}, - descriptor::{self, collect_dora_timers, CoreNodeKind, Descriptor}, + config::{CommunicationConfig, NodeId}, + daemon_messages::{ + DaemonCoordinatorEvent, DaemonCoordinatorReply, SpawnDataflowNodes, SpawnNodeParams, + }, + descriptor::{collect_dora_timers, CoreNodeKind, Descriptor}, }; use eyre::{bail, eyre, ContextCompat, WrapErr}; use futures::{stream::FuturesUnordered, StreamExt}; @@ -14,7 +15,6 @@ use std::{ path::Path, }; use tokio::net::TcpStream; -use tokio_stream::wrappers::IntervalStream; use uuid::Uuid; mod runtime; @@ -104,6 +104,19 @@ pub async fn spawn_dataflow( .await .wrap_err("failed to send spawn message to daemon")?; + // wait for reply + let reply_raw = tcp_receive(daemon_connection) + .await + .wrap_err("failed to receive spawn reply from daemon")?; + match serde_json::from_slice(&reply_raw) + .wrap_err("failed to deserialize spawn reply from daemon")? 
+ { + DaemonCoordinatorReply::SpawnResult(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to spawn dataflow")?, + } + tracing::info!("successfully spawned dataflow `{uuid}`"); + Ok(SpawnedDataflow { tasks: FuturesUnordered::new(), // TODO communication_config, diff --git a/binaries/daemon/src/coordinator.rs b/binaries/daemon/src/coordinator.rs index b846c697..c029de4a 100644 --- a/binaries/daemon/src/coordinator.rs +++ b/binaries/daemon/src/coordinator.rs @@ -2,13 +2,25 @@ use crate::{ tcp_utils::{tcp_receive, tcp_send}, DaemonCoordinatorEvent, }; -use dora_core::coordinator_messages::{CoordinatorRequest, RegisterResult}; +use dora_core::{ + coordinator_messages::{CoordinatorRequest, RegisterResult}, + daemon_messages::DaemonCoordinatorReply, +}; use eyre::{eyre, Context}; use std::{io::ErrorKind, net::SocketAddr}; -use tokio::{net::TcpStream, sync::mpsc}; +use tokio::{ + net::TcpStream, + sync::{mpsc, oneshot}, +}; use tokio_stream::{wrappers::ReceiverStream, Stream}; -pub async fn connect(addr: SocketAddr) -> eyre::Result> { +#[derive(Debug)] +pub struct CoordinatorEvent { + pub event: DaemonCoordinatorEvent, + pub reply_tx: oneshot::Sender, +} + +pub async fn connect(addr: SocketAddr) -> eyre::Result> { let mut stream = TcpStream::connect(addr) .await .wrap_err("failed to connect to dora-coordinator")?; @@ -49,13 +61,32 @@ pub async fn connect(addr: SocketAddr) -> eyre::Result {} Err(_) => { // receiving end of channel was closed break; } } + + let Ok(reply) = reply_rx.await else { + tracing::warn!("daemon sent no reply"); + continue; + }; + let serialized = match serde_json::to_vec(&reply) + .wrap_err("failed to serialize DaemonCoordinatorReply") + { + Ok(r) => r, + Err(err) => { + tracing::error!("{err:?}"); + continue; + } + }; + if let Err(err) = tcp_send(&mut stream, &serialized).await { + tracing::warn!("failed to send reply to coordinator: {err}"); + continue; + }; } }); diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs 
index 0fbb64e0..ed10bc94 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,8 +1,9 @@ +use coordinator::CoordinatorEvent; use dora_core::{ config::{DataId, InputMapping, NodeId}, daemon_messages::{ - self, ControlReply, DaemonCoordinatorEvent, DataflowId, DropEvent, DropToken, - SpawnDataflowNodes, + self, ControlReply, DaemonCoordinatorEvent, DaemonCoordinatorReply, DataflowId, DropEvent, + DropToken, SpawnDataflowNodes, }, topics::DORA_COORDINATOR_PORT_DEFAULT, }; @@ -111,7 +112,12 @@ impl Daemon { Event::ConnectError(err) => { tracing::warn!("{:?}", err.wrap_err("failed to connect")); } - Event::Coordinator(event) => self.handle_coordinator_event(event).await?, + Event::Coordinator(CoordinatorEvent { event, reply_tx }) => { + let result = self.handle_coordinator_event(event).await; + let _ = reply_tx.send(DaemonCoordinatorReply::SpawnResult( + result.map_err(|err| format!("{err:?}")), + )); + } Event::Node { dataflow_id: dataflow, node_id, @@ -451,7 +457,7 @@ pub enum Event { event: DaemonNodeEvent, reply_sender: oneshot::Sender, }, - Coordinator(DaemonCoordinatorEvent), + Coordinator(CoordinatorEvent), Dora(DoraEvent), Drop(DropEvent), } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 54e0b3fd..361f0289 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -84,6 +84,11 @@ pub enum DaemonCoordinatorEvent { Spawn(SpawnDataflowNodes), } +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub enum DaemonCoordinatorReply { + SpawnResult(Result<(), String>), +} + pub type DataflowId = Uuid; #[derive(Debug, serde::Deserialize, serde::Serialize)] From b9aa065bd1ca7ac8b915a56cd558ec3b79910835 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 21 Dec 2022 14:02:48 +0100 Subject: [PATCH 035/225] Don't send 'dataflow finished' event immediately --- binaries/coordinator/src/lib.rs | 49 +++-------------------------- 
binaries/coordinator/src/run/mod.rs | 13 -------- 2 files changed, 5 insertions(+), 57 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 0bbdf3d3..76b631cc 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -11,12 +11,10 @@ use dora_core::{ use eyre::{bail, WrapErr}; use futures::StreamExt; use futures_concurrency::stream::Merge; -use run::{await_tasks, SpawnedDataflow}; +use run::SpawnedDataflow; use std::{ - collections::{hash_map, HashMap}, - io::ErrorKind, + collections::HashMap, path::{Path, PathBuf}, - time::Duration, }; use tokio::net::TcpStream; use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream}; @@ -50,14 +48,10 @@ pub async fn run(args: Args) -> eyre::Result<()> { .with_file_name("dora-runtime") }); - let daemon_connections = &mut HashMap::new(); // TODO - match run_dataflow { Some(path) => { // start the given dataflow directly - run::run_dataflow(&path, &runtime_path, daemon_connections) - .await - .wrap_err_with(|| format!("failed to run dataflow at {}", path.display()))?; + todo!(); } None => { // start in daemon mode @@ -76,10 +70,6 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { .unwrap_or_else(Event::DaemonConnectError) }); - let (dataflow_events_tx, dataflow_events) = tokio::sync::mpsc::channel(2); - let mut dataflow_events_tx = Some(dataflow_events_tx); - let dataflow_events = ReceiverStream::new(dataflow_events); - let (daemon_events_tx, daemon_events) = tokio::sync::mpsc::channel(2); let daemon_events = ReceiverStream::new(daemon_events); @@ -89,13 +79,7 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { .wrap_err("failed to create control events")?, ); - let mut events = ( - new_daemon_connections, - daemon_events, - dataflow_events, - control_events, - ) - .merge(); + let mut events = (new_daemon_connections, daemon_events, control_events).merge(); let mut running_dataflows = HashMap::new(); let mut daemon_connections: 
HashMap<_, TcpStream> = HashMap::new(); @@ -171,7 +155,6 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { &dataflow_path, name, runtime_path, - &dataflow_events_tx, &mut daemon_connections, ) .await?; @@ -233,9 +216,6 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { control_events_abort.abort(); - // ensure that no new dataflows can be started - dataflow_events_tx = None; - // stop all running dataflows for &uuid in running_dataflows.keys() { stop_dataflow(&running_dataflows, uuid).await?; @@ -335,35 +315,16 @@ async fn start_dataflow( path: &Path, name: Option, runtime_path: &Path, - dataflow_events_tx: &Option>, daemon_connections: &mut HashMap, ) -> eyre::Result { // TODO: send Spawn message to daemon let runtime_path = runtime_path.to_owned(); - let dataflow_events_tx = match dataflow_events_tx { - Some(channel) => channel.clone(), - None => bail!("cannot start new dataflow after receiving stop command"), - }; + let SpawnedDataflow { uuid, communication_config, - tasks, } = spawn_dataflow(&runtime_path, path, daemon_connections).await?; - let path = path.to_owned(); - let task = async move { - let result = await_tasks(tasks) - .await - .wrap_err_with(|| format!("failed to run dataflow at {}", path.display())); - - let _ = dataflow_events_tx - .send(Event::Dataflow { - uuid, - event: DataflowEvent::Finished { result }, - }) - .await; - }; - tokio::spawn(task); Ok(RunningDataflow { uuid, name, diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index c4c947a8..5d5955e5 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -19,17 +19,6 @@ use uuid::Uuid; mod runtime; -pub async fn run_dataflow( - dataflow_path: &Path, - runtime: &Path, - daemon_connections: &mut HashMap, -) -> eyre::Result<()> { - let tasks = spawn_dataflow(runtime, dataflow_path, daemon_connections) - .await? 
- .tasks; - await_tasks(tasks).await -} - pub async fn spawn_dataflow( runtime: &Path, dataflow_path: &Path, @@ -118,7 +107,6 @@ pub async fn spawn_dataflow( tracing::info!("successfully spawned dataflow `{uuid}`"); Ok(SpawnedDataflow { - tasks: FuturesUnordered::new(), // TODO communication_config, uuid, }) @@ -127,7 +115,6 @@ pub async fn spawn_dataflow( pub struct SpawnedDataflow { pub uuid: Uuid, pub communication_config: CommunicationConfig, - pub tasks: FuturesUnordered>>, } pub async fn await_tasks( From 377703c325d996facc970adb893f7c579d891873 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 21 Dec 2022 15:19:35 +0100 Subject: [PATCH 036/225] Report finished dataflows from daemon to coordinator Makes the `dora list` command work. --- binaries/coordinator/src/lib.rs | 67 +++++++++++----------- binaries/coordinator/src/listener.rs | 21 ++++++- binaries/coordinator/src/run/mod.rs | 11 +++- binaries/daemon/src/coordinator.rs | 28 +++++++-- binaries/daemon/src/main.rs | 32 +++++++++-- libraries/core/src/coordinator_messages.rs | 18 +++++- 6 files changed, 128 insertions(+), 49 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 76b631cc..690647a7 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -13,7 +13,7 @@ use futures::StreamExt; use futures_concurrency::stream::Merge; use run::SpawnedDataflow; use std::{ - collections::HashMap, + collections::{BTreeSet, HashMap}, path::{Path, PathBuf}, }; use tokio::net::TcpStream; @@ -81,7 +81,7 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { let mut events = (new_daemon_connections, daemon_events, control_events).merge(); - let mut running_dataflows = HashMap::new(); + let mut running_dataflows: HashMap = HashMap::new(); let mut daemon_connections: HashMap<_, TcpStream> = HashMap::new(); while let Some(event) = events.next().await { @@ -117,15 +117,27 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { } } 
Event::Dataflow { uuid, event } => match event { - DataflowEvent::Finished { result } => { - running_dataflows.remove(&uuid); - match result { - Ok(()) => { - tracing::info!("dataflow `{uuid}` finished successfully"); + DataflowEvent::DataflowFinishedOnMachine { machine_id, result } => { + match running_dataflows.entry(uuid) { + std::collections::hash_map::Entry::Occupied(mut entry) => { + entry.get_mut().machines.remove(&machine_id); + match result { + Ok(()) => { + tracing::info!("dataflow `{uuid}` finished successfully on machine `{machine_id}`"); + } + Err(err) => { + let err = + err.wrap_err(format!("error occured in dataflow `{uuid}` on machine `{machine_id}`")); + tracing::error!("{err:?}"); + } + } + if entry.get_mut().machines.is_empty() { + entry.remove(); + tracing::info!("dataflow `{uuid}` finished"); + } } - Err(err) => { - let err = err.wrap_err(format!("error occured in dataflow `{uuid}`")); - tracing::error!("{err:?}"); + std::collections::hash_map::Entry::Vacant(_) => { + tracing::warn!("dataflow not running on DataflowFinishedOnMachine"); } } } @@ -225,7 +237,7 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { } ControlRequest::List => { let mut dataflows: Vec<_> = running_dataflows.values().collect(); - dataflows.sort(); + dataflows.sort_by_key(|d| (&d.name, d.uuid)); let reply = ListDataflowResult::Ok { dataflows: dataflows @@ -256,36 +268,18 @@ struct RunningDataflow { name: Option, uuid: Uuid, communication_config: CommunicationConfig, + /// The IDs of the machines that the dataflow is running on. 
+ machines: BTreeSet, } impl PartialEq for RunningDataflow { fn eq(&self, other: &Self) -> bool { - self.name == other.name && self.uuid == other.uuid + self.name == other.name && self.uuid == other.uuid && self.machines == other.machines } } impl Eq for RunningDataflow {} -impl PartialOrd for RunningDataflow { - fn partial_cmp(&self, other: &Self) -> Option { - match self.name.partial_cmp(&other.name) { - Some(core::cmp::Ordering::Equal) => {} - ord => return ord, - } - self.uuid.partial_cmp(&other.uuid) - } -} - -impl Ord for RunningDataflow { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match self.name.cmp(&other.name) { - core::cmp::Ordering::Equal => {} - ord => return ord, - } - self.uuid.cmp(&other.uuid) - } -} - async fn stop_dataflow( running_dataflows: &HashMap, uuid: Uuid, @@ -317,18 +311,18 @@ async fn start_dataflow( runtime_path: &Path, daemon_connections: &mut HashMap, ) -> eyre::Result { - // TODO: send Spawn message to daemon - let runtime_path = runtime_path.to_owned(); let SpawnedDataflow { uuid, communication_config, + machines, } = spawn_dataflow(&runtime_path, path, daemon_connections).await?; Ok(RunningDataflow { uuid, name, communication_config, + machines, }) } @@ -343,7 +337,10 @@ pub enum Event { #[derive(Debug)] pub enum DataflowEvent { - Finished { result: eyre::Result<()> }, + DataflowFinishedOnMachine { + machine_id: String, + result: eyre::Result<()>, + }, } #[derive(Debug)] diff --git a/binaries/coordinator/src/listener.rs b/binaries/coordinator/src/listener.rs index 63164866..a7cd938b 100644 --- a/binaries/coordinator/src/listener.rs +++ b/binaries/coordinator/src/listener.rs @@ -1,6 +1,6 @@ -use crate::{tcp_utils::tcp_receive, DaemonEvent, Event}; +use crate::{tcp_utils::tcp_receive, DaemonEvent, DataflowEvent, Event}; use dora_core::coordinator_messages; -use eyre::Context; +use eyre::{eyre, Context}; use std::{io::ErrorKind, net::Ipv4Addr}; use tokio::{ net::{TcpListener, TcpStream}, @@ -50,6 +50,23 @@ pub async fn 
handle_connection(mut connection: TcpStream, events_tx: mpsc::Sende let _ = events_tx.send(Event::Daemon(event)).await; break; } + coordinator_messages::CoordinatorRequest::Event { machine_id, event } => match event { + coordinator_messages::DaemonEvent::AllNodesFinished { + dataflow_id, + result, + } => { + let event = Event::Dataflow { + uuid: dataflow_id, + event: DataflowEvent::DataflowFinishedOnMachine { + machine_id, + result: result.map_err(|e| eyre!(e)), + }, + }; + if events_tx.send(event).await.is_err() { + break; + } + } + }, }; } } diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index 5d5955e5..73d6cac7 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -10,7 +10,7 @@ use dora_core::{ use eyre::{bail, eyre, ContextCompat, WrapErr}; use futures::{stream::FuturesUnordered, StreamExt}; use std::{ - collections::{BTreeMap, HashMap}, + collections::{BTreeMap, BTreeSet, HashMap}, env::consts::EXE_EXTENSION, path::Path, }; @@ -86,8 +86,13 @@ pub async fn spawn_dataflow( nodes: custom_nodes, }; let message = serde_json::to_vec(&DaemonCoordinatorEvent::Spawn(spawn_command))?; + + // TODO allow partitioning a dataflow across multiple machines + let machine_id = ""; + let machines = [machine_id.to_owned()].into(); + let daemon_connection = daemon_connections - .get_mut("") + .get_mut(machine_id) .wrap_err("no daemon connection")?; // TODO: take from dataflow spec tcp_send(daemon_connection, &message) .await @@ -109,12 +114,14 @@ pub async fn spawn_dataflow( Ok(SpawnedDataflow { communication_config, uuid, + machines, }) } pub struct SpawnedDataflow { pub uuid: Uuid, pub communication_config: CommunicationConfig, + pub machines: BTreeSet, } pub async fn await_tasks( diff --git a/binaries/daemon/src/coordinator.rs b/binaries/daemon/src/coordinator.rs index c029de4a..99477123 100644 --- a/binaries/daemon/src/coordinator.rs +++ b/binaries/daemon/src/coordinator.rs @@ -3,7 +3,7 @@ use 
crate::{ DaemonCoordinatorEvent, }; use dora_core::{ - coordinator_messages::{CoordinatorRequest, RegisterResult}, + coordinator_messages::{CoordinatorRequest, DaemonEvent, RegisterResult}, daemon_messages::DaemonCoordinatorReply, }; use eyre::{eyre, Context}; @@ -20,16 +20,17 @@ pub struct CoordinatorEvent { pub reply_tx: oneshot::Sender, } -pub async fn connect(addr: SocketAddr) -> eyre::Result> { +pub async fn register( + addr: SocketAddr, + machine_id: String, +) -> eyre::Result> { let mut stream = TcpStream::connect(addr) .await .wrap_err("failed to connect to dora-coordinator")?; stream .set_nodelay(true) .wrap_err("failed to set TCP_NODELAY")?; - let register = serde_json::to_vec(&CoordinatorRequest::Register { - machine_id: String::new(), // TODO - })?; + let register = serde_json::to_vec(&CoordinatorRequest::Register { machine_id })?; tcp_send(&mut stream, ®ister) .await .wrap_err("failed to send register request to dora-coordinator")?; @@ -92,3 +93,20 @@ pub async fn connect(addr: SocketAddr) -> eyre::Result eyre::Result<()> { + let mut stream = TcpStream::connect(addr) + .await + .wrap_err("failed to connect to dora-coordinator")?; + stream + .set_nodelay(true) + .wrap_err("failed to set TCP_NODELAY")?; + let msg = serde_json::to_vec(&CoordinatorRequest::Event { machine_id, event })?; + tcp_send(&mut stream, &msg) + .await + .wrap_err("failed to send event to dora-coordinator") +} diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index ed10bc94..2eca498d 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,6 +1,7 @@ use coordinator::CoordinatorEvent; use dora_core::{ config::{DataId, InputMapping, NodeId}, + coordinator_messages::DaemonEvent, daemon_messages::{ self, ControlReply, DaemonCoordinatorEvent, DaemonCoordinatorReply, DataflowId, DropEvent, DropToken, SpawnDataflowNodes, @@ -47,7 +48,9 @@ async fn run() -> eyre::Result<()> { let localhost = Ipv4Addr::new(127, 0, 0, 1); let coordinator_socket = 
(localhost, DORA_COORDINATOR_PORT_DEFAULT); - Daemon::run(coordinator_socket.into()).await + let machine_id = String::new(); // TODO + + Daemon::run(coordinator_socket.into(), machine_id).await } struct Daemon { @@ -58,12 +61,15 @@ struct Daemon { running: HashMap, dora_events_tx: mpsc::Sender, + + coordinator_addr: SocketAddr, + machine_id: String, } impl Daemon { - pub async fn run(coordinator_addr: SocketAddr) -> eyre::Result<()> { + pub async fn run(coordinator_addr: SocketAddr, machine_id: String) -> eyre::Result<()> { // connect to the coordinator - let coordinator_events = coordinator::connect(coordinator_addr) + let coordinator_events = coordinator::register(coordinator_addr, machine_id.clone()) .await .wrap_err("failed to connect to dora-coordinator")? .map(Event::Coordinator); @@ -88,6 +94,8 @@ impl Daemon { sent_out_shared_memory: Default::default(), running: HashMap::new(), dora_events_tx, + coordinator_addr, + machine_id, }; let dora_events = ReceiverStream::new(dora_events_rx).map(Event::Dora); let events = (coordinator_events, new_connections, dora_events).merge(); @@ -352,7 +360,23 @@ impl Daemon { dataflow.subscribe_channels.remove(&node_id); if dataflow.subscribe_channels.is_empty() { - tracing::info!("Dataflow `{dataflow_id}` finished"); + tracing::info!( + "Dataflow `{dataflow_id}` finished on machine `{}`", + self.machine_id + ); + if coordinator::send_event( + self.coordinator_addr, + self.machine_id.clone(), + DaemonEvent::AllNodesFinished { + dataflow_id, + result: Ok(()), + }, + ) + .await + .is_err() + { + tracing::warn!("failed to report dataflow finish to coordinator"); + } self.running.remove(&dataflow_id); } } diff --git a/libraries/core/src/coordinator_messages.rs b/libraries/core/src/coordinator_messages.rs index 5c85e754..48618b3c 100644 --- a/libraries/core/src/coordinator_messages.rs +++ b/libraries/core/src/coordinator_messages.rs @@ -1,8 +1,24 @@ use eyre::eyre; +use crate::daemon_messages::DataflowId; + #[derive(Debug, 
serde::Serialize, serde::Deserialize)] pub enum CoordinatorRequest { - Register { machine_id: String }, + Register { + machine_id: String, + }, + Event { + machine_id: String, + event: DaemonEvent, + }, +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub enum DaemonEvent { + AllNodesFinished { + dataflow_id: DataflowId, + result: Result<(), String>, + }, } #[derive(Debug, serde::Serialize, serde::Deserialize)] From 2ed7db1b0ee6f5545d9afbf903ccee64d963b9fe Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 21 Dec 2022 15:34:28 +0100 Subject: [PATCH 037/225] Make `dora stop` command working with new daemon design --- binaries/coordinator/src/lib.rs | 68 ++++++++++++++++++--------- binaries/coordinator/src/run/mod.rs | 1 + binaries/daemon/src/main.rs | 12 +++++ libraries/core/src/daemon_messages.rs | 2 + 4 files changed, 62 insertions(+), 21 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 690647a7..07a3bec7 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -1,14 +1,18 @@ -use crate::{run::spawn_dataflow, tcp_utils::tcp_send}; +use crate::{ + run::spawn_dataflow, + tcp_utils::{tcp_receive, tcp_send}, +}; use control::ControlEvent; use dora_core::{ config::CommunicationConfig, coordinator_messages::RegisterResult, + daemon_messages::{DaemonCoordinatorEvent, DaemonCoordinatorReply}, topics::{ control_socket_addr, ControlRequest, DataflowId, ListDataflowResult, StartDataflowResult, StopDataflowResult, DORA_COORDINATOR_PORT_DEFAULT, }, }; -use eyre::{bail, WrapErr}; +use eyre::{bail, eyre, ContextCompat, WrapErr}; use futures::StreamExt; use futures_concurrency::stream::Merge; use run::SpawnedDataflow; @@ -187,7 +191,12 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { } ControlRequest::Stop { dataflow_uuid } => { let stop = async { - stop_dataflow(&running_dataflows, dataflow_uuid).await?; + stop_dataflow( + &running_dataflows, + dataflow_uuid, + &mut 
daemon_connections, + ) + .await?; Result::<_, eyre::Report>::Ok(()) }; let reply = match stop.await { @@ -213,7 +222,12 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { bail!("multiple dataflows found with name `{name}`"); }; - stop_dataflow(&running_dataflows, dataflow_uuid).await?; + stop_dataflow( + &running_dataflows, + dataflow_uuid, + &mut daemon_connections, + ) + .await?; Result::<_, eyre::Report>::Ok(()) }; let reply = match stop.await { @@ -230,7 +244,8 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { // stop all running dataflows for &uuid in running_dataflows.keys() { - stop_dataflow(&running_dataflows, uuid).await?; + stop_dataflow(&running_dataflows, uuid, &mut daemon_connections) + .await?; } b"ok".as_slice().into() @@ -283,25 +298,36 @@ impl Eq for RunningDataflow {} async fn stop_dataflow( running_dataflows: &HashMap, uuid: Uuid, + daemon_connections: &mut HashMap, ) -> eyre::Result<()> { - let communication_config = match running_dataflows.get(&uuid) { - Some(dataflow) => dataflow.communication_config.clone(), - None => bail!("No running dataflow found with UUID `{uuid}`"), + let Some(dataflow) = running_dataflows.get(&uuid) else { + bail!("No running dataflow found with UUID `{uuid}`") }; + let message = serde_json::to_vec(&DaemonCoordinatorEvent::StopDataflow { dataflow_id: uuid })?; + + for machine_id in &dataflow.machines { + let daemon_connection = daemon_connections + .get_mut(machine_id) + .wrap_err("no daemon connection")?; // TODO: take from dataflow spec + tcp_send(daemon_connection, &message) + .await + .wrap_err("failed to send stop message to daemon")?; + + // wait for reply + let reply_raw = tcp_receive(daemon_connection) + .await + .wrap_err("failed to receive stop reply from daemon")?; + match serde_json::from_slice(&reply_raw) + .wrap_err("failed to deserialize stop reply from daemon")? 
+ { + DaemonCoordinatorReply::StopResult(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to stop dataflow")?, + _ => bail!("unexpected reply"), + } + } + tracing::info!("successfully stoped dataflow `{uuid}`"); - todo!(); - // let mut communication = - // tokio::task::spawn_blocking(move || communication::init(&communication_config)) - // .await - // .wrap_err("failed to join communication layer init task")? - // .wrap_err("failed to init communication layer")?; - // tracing::info!("sending stop message to dataflow `{uuid}`"); - // let manual_stop_publisher = manual_stop_publisher(communication.as_mut())?; - // tokio::task::spawn_blocking(move || manual_stop_publisher()) - // .await - // .wrap_err("failed to join stop publish task")? - // .map_err(|err| eyre!(err)) - // .wrap_err("failed to send stop message")?; Ok(()) } diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index 73d6cac7..dddb01c4 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -108,6 +108,7 @@ pub async fn spawn_dataflow( DaemonCoordinatorReply::SpawnResult(result) => result .map_err(|e| eyre!(e)) .wrap_err("failed to spawn dataflow")?, + _ => bail!("unexpected reply"), } tracing::info!("successfully spawned dataflow `{uuid}`"); diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 2eca498d..52f21ec4 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -224,6 +224,18 @@ impl Daemon { dataflow._timer_handles.push(handle); } + Ok(()) + } + DaemonCoordinatorEvent::StopDataflow { dataflow_id } => { + let dataflow = self + .running + .get_mut(&dataflow_id) + .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; + + for channel in dataflow.subscribe_channels.values_mut() { + let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; + } + Ok(()) } } diff --git a/libraries/core/src/daemon_messages.rs 
b/libraries/core/src/daemon_messages.rs index 361f0289..68a81327 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -82,11 +82,13 @@ pub struct InputData { #[derive(Debug, serde::Deserialize, serde::Serialize)] pub enum DaemonCoordinatorEvent { Spawn(SpawnDataflowNodes), + StopDataflow { dataflow_id: DataflowId }, } #[derive(Debug, serde::Deserialize, serde::Serialize)] pub enum DaemonCoordinatorReply { SpawnResult(Result<(), String>), + StopResult(Result<(), String>), } pub type DataflowId = Uuid; From dbe81068c4a82eff77d534505c9e7696daf38aab Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 21 Dec 2022 17:09:24 +0100 Subject: [PATCH 038/225] Update Python node API for new daemon design --- Cargo.lock | 1 - apis/python/node/Cargo.toml | 2 +- apis/python/node/src/lib.rs | 69 +++++++++++++++++++++---------------- 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a806a614..da86d159 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1077,7 +1077,6 @@ version = "0.1.1-2" dependencies = [ "dora-node-api", "dora-operator-api-python", - "dora-runtime", "eyre", "flume", "pyo3", diff --git a/apis/python/node/Cargo.toml b/apis/python/node/Cargo.toml index fdce146c..a3c23a77 100644 --- a/apis/python/node/Cargo.toml +++ b/apis/python/node/Cargo.toml @@ -13,7 +13,7 @@ pyo3 = { version = "0.16", features = ["eyre", "abi3-py37"] } eyre = "0.6" serde_yaml = "0.8.23" flume = "0.10.14" -dora-runtime = { path = "../../../binaries/runtime" } +# dora-runtime = { path = "../../../binaries/runtime" } [lib] name = "dora" diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index b729b782..61c102aa 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -1,9 +1,14 @@ #![allow(clippy::borrow_deref_ref)] // clippy warns about code generated by #[pymethods] -use dora_node_api::{dora_core::config::NodeId, DoraNode, Input}; +use std::ops::Deref; + +use 
dora_node_api::{ + daemon::{Event, EventStream}, + dora_core::config::NodeId, + DoraNode, +}; use dora_operator_api_python::{metadata_to_pydict, pydict_to_metadata}; use eyre::{Context, Result}; -use flume::Receiver; use pyo3::{ prelude::*, types::{PyBytes, PyDict}, @@ -12,20 +17,27 @@ use pyo3::{ #[pyclass] pub struct Node { id: NodeId, - inputs: Receiver, + events: EventStream, node: DoraNode, } -pub struct PyInput(Input); +pub struct PyInput<'a>(Event<'a>); -impl IntoPy for PyInput { +impl IntoPy for PyInput<'_> { fn into_py(self, py: Python) -> PyObject { - ( - self.0.id.to_string(), - PyBytes::new(py, &self.0.data()), - metadata_to_pydict(self.0.metadata(), py), - ) - .into_py(py) + match self.0 { + Event::Stop => ("stop").into_py(py), + Event::Input { id, metadata, data } => ( + "input", + id.to_string(), + PyBytes::new(py, data.as_deref().unwrap_or_default()), + metadata_to_pydict(&metadata, py), + ) + .into_py(py), + Event::InputClosed { id } => ("input-closed", id.deref()).into_py(py), + Event::Error(err) => ("error", err).into_py(py), + other => ("unknown", format!("{other:?}")).into_py(py), + } } } @@ -39,10 +51,9 @@ impl Node { serde_yaml::from_str(&raw).context("failed to deserialize operator config")? 
}; - let mut node = DoraNode::init_from_env()?; - let inputs = node.inputs()?; + let (node, events) = DoraNode::init_from_env()?; - Ok(Node { id, inputs, node }) + Ok(Node { id, events, node }) } #[allow(clippy::should_implement_trait)] @@ -51,7 +62,7 @@ impl Node { } pub fn __next__(&mut self) -> PyResult> { - Ok(self.inputs.recv().ok().map(PyInput)) + Ok(self.events.recv().map(PyInput)) } fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { @@ -67,7 +78,7 @@ impl Node { let data = data.as_bytes(); let metadata = pydict_to_metadata(metadata)?; self.node - .send_output(&output_id.into(), metadata, data.len(), |out| { + .send_output(output_id.into(), metadata, data.len(), |out| { out.copy_from_slice(data); }) .wrap_err("Could not send output") @@ -78,17 +89,17 @@ impl Node { } } -#[pyfunction] -fn start_runtime() -> Result<()> { - dora_runtime::main() - .wrap_err("Python Dora Runtime failed.") - .unwrap(); - Ok(()) -} +// #[pyfunction] +// fn start_runtime() -> Result<()> { +// dora_runtime::main() +// .wrap_err("Python Dora Runtime failed.") +// .unwrap(); +// Ok(()) +// } -#[pymodule] -fn dora(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_function(wrap_pyfunction!(start_runtime, m)?)?; - m.add_class::().unwrap(); - Ok(()) -} +// #[pymodule] +// fn dora(_py: Python, m: &PyModule) -> PyResult<()> { +// m.add_function(wrap_pyfunction!(start_runtime, m)?)?; +// m.add_class::().unwrap(); +// Ok(()) +// } From 34e4e62188e70bac70b115cfccc51c8aec81805b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 12:41:15 +0100 Subject: [PATCH 039/225] Split off library from daemon binary To allow usage for tests and examples. 
--- binaries/daemon/src/lib.rs | 513 +++++++++++++++++++++++++++++++++++ binaries/daemon/src/main.rs | 516 +----------------------------------- 2 files changed, 517 insertions(+), 512 deletions(-) create mode 100644 binaries/daemon/src/lib.rs diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs new file mode 100644 index 00000000..1b9bd544 --- /dev/null +++ b/binaries/daemon/src/lib.rs @@ -0,0 +1,513 @@ +use coordinator::CoordinatorEvent; +use dora_core::{ + config::{DataId, InputMapping, NodeId}, + coordinator_messages::DaemonEvent, + daemon_messages::{ + self, ControlReply, DaemonCoordinatorEvent, DaemonCoordinatorReply, DataflowId, DropEvent, + DropToken, SpawnDataflowNodes, + }, +}; +use dora_message::uhlc::HLC; +use eyre::{bail, eyre, Context, ContextCompat}; +use futures::FutureExt; +use futures_concurrency::stream::Merge; +use shared_memory::{Shmem, ShmemConf}; +use std::{ + collections::{BTreeMap, BTreeSet, HashMap}, + net::SocketAddr, + rc::Rc, + time::Duration, +}; +use tokio::{ + net::TcpStream, + sync::{mpsc, oneshot}, + time::timeout, +}; +use tokio_stream::{ + wrappers::{ReceiverStream, TcpListenerStream}, + Stream, StreamExt, +}; + +mod coordinator; +mod listener; +mod spawn; +mod tcp_utils; + +pub struct Daemon { + port: u16, + uninit_shared_memory: HashMap, Shmem)>, + sent_out_shared_memory: HashMap>, + + running: HashMap, + + dora_events_tx: mpsc::Sender, + + coordinator_addr: SocketAddr, + machine_id: String, +} + +impl Daemon { + pub async fn run(coordinator_addr: SocketAddr, machine_id: String) -> eyre::Result<()> { + // connect to the coordinator + let coordinator_events = coordinator::register(coordinator_addr, machine_id.clone()) + .await + .wrap_err("failed to connect to dora-coordinator")? + .map(Event::Coordinator); + + // create listener for node connection + let listener = listener::create_listener().await?; + let port = listener + .local_addr() + .wrap_err("failed to get local addr of listener")? 
+ .port(); + let new_connections = TcpListenerStream::new(listener).map(|c| { + c.map(Event::NewConnection) + .wrap_err("failed to open connection") + .unwrap_or_else(Event::ConnectError) + }); + tracing::info!("Listening for node connections on 127.0.0.1:{port}"); + + let (dora_events_tx, dora_events_rx) = mpsc::channel(5); + let daemon = Self { + port, + uninit_shared_memory: Default::default(), + sent_out_shared_memory: Default::default(), + running: HashMap::new(), + dora_events_tx, + coordinator_addr, + machine_id, + }; + let dora_events = ReceiverStream::new(dora_events_rx).map(Event::Dora); + let events = (coordinator_events, new_connections, dora_events).merge(); + daemon.run_inner(events).await + } + + async fn run_inner( + mut self, + incoming_events: impl Stream + Unpin, + ) -> eyre::Result<()> { + let (node_events_tx, node_events_rx) = mpsc::channel(10); + let node_events = ReceiverStream::new(node_events_rx); + + let mut events = (incoming_events, node_events).merge(); + + while let Some(event) = events.next().await { + match event { + Event::NewConnection(connection) => { + let events_tx = node_events_tx.clone(); + tokio::spawn(listener::handle_connection(connection, events_tx)); + } + Event::ConnectError(err) => { + tracing::warn!("{:?}", err.wrap_err("failed to connect")); + } + Event::Coordinator(CoordinatorEvent { event, reply_tx }) => { + let result = self.handle_coordinator_event(event).await; + let _ = reply_tx.send(DaemonCoordinatorReply::SpawnResult( + result.map_err(|err| format!("{err:?}")), + )); + } + Event::Node { + dataflow_id: dataflow, + node_id, + event, + reply_sender, + } => { + self.handle_node_event(event, dataflow, node_id, reply_sender) + .await? 
+ } + Event::Dora(event) => self.handle_dora_event(event).await?, + Event::Drop(DropEvent { token }) => { + match self.sent_out_shared_memory.remove(&token) { + Some(rc) => { + if let Ok(_shmem) = Rc::try_unwrap(rc) { + tracing::trace!( + "freeing shared memory after receiving last drop token" + ) + } + } + None => tracing::warn!("received unknown drop token {token:?}"), + } + } + } + } + + Ok(()) + } + + async fn handle_coordinator_event( + &mut self, + event: DaemonCoordinatorEvent, + ) -> eyre::Result<()> { + match event { + DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { dataflow_id, nodes }) => { + self.spawn_dataflow(dataflow_id, nodes).await + } + DaemonCoordinatorEvent::StopDataflow { dataflow_id } => { + let dataflow = self + .running + .get_mut(&dataflow_id) + .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; + + for channel in dataflow.subscribe_channels.values_mut() { + let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; + } + + Ok(()) + } + } + } + + async fn spawn_dataflow( + &mut self, + dataflow_id: uuid::Uuid, + nodes: BTreeMap, + ) -> eyre::Result<()> { + let dataflow = match self.running.entry(dataflow_id) { + std::collections::hash_map::Entry::Vacant(entry) => entry.insert(Default::default()), + std::collections::hash_map::Entry::Occupied(_) => { + bail!("there is already a running dataflow with ID `{dataflow_id}`") + } + }; + for (node_id, params) in nodes { + for (input_id, mapping) in params.node.run_config.inputs.clone() { + match mapping { + InputMapping::User(mapping) => { + if mapping.operator.is_some() { + bail!("operators are not supported"); + } + dataflow + .mappings + .entry((mapping.source, mapping.output)) + .or_default() + .insert((node_id.clone(), input_id)); + } + InputMapping::Timer { interval } => { + dataflow + .timers + .entry(interval) + .or_default() + .insert((node_id.clone(), input_id)); + } + } + } + + spawn::spawn_node(dataflow_id, params, self.port, 
self.dora_events_tx.clone()) + .await + .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; + } + for interval in dataflow.timers.keys().copied() { + let events_tx = self.dora_events_tx.clone(); + let task = async move { + let mut interval_stream = tokio::time::interval(interval); + let hlc = HLC::default(); + loop { + interval_stream.tick().await; + + let event = DoraEvent::Timer { + dataflow_id, + interval, + metadata: dora_message::Metadata::from_parameters( + hlc.new_timestamp(), + Default::default(), + ), + }; + if events_tx.send(event).await.is_err() { + break; + } + } + }; + let (task, handle) = task.remote_handle(); + tokio::spawn(task); + dataflow._timer_handles.push(handle); + } + Ok(()) + } + + async fn handle_node_event( + &mut self, + event: DaemonNodeEvent, + dataflow_id: DataflowId, + node_id: NodeId, + reply_sender: oneshot::Sender, + ) -> Result<(), eyre::ErrReport> { + match event { + DaemonNodeEvent::Subscribe { event_sender } => { + let result = match self.running.get_mut(&dataflow_id) { + Some(dataflow) => { + dataflow.subscribe_channels.insert(node_id, event_sender); + Ok(()) + } + None => Err(format!("no running dataflow with ID `{dataflow_id}`")), + }; + let _ = reply_sender.send(ControlReply::Result(result)); + } + DaemonNodeEvent::PrepareOutputMessage { + output_id, + metadata, + data_len, + } => { + let memory = ShmemConf::new() + .size(data_len) + .create() + .wrap_err("failed to allocate shared memory")?; + let id = memory.get_os_id().to_owned(); + self.uninit_shared_memory + .insert(id.clone(), (output_id, metadata, memory)); + + let reply = ControlReply::PreparedMessage { + shared_memory_id: id.clone(), + }; + if reply_sender.send(reply).is_err() { + // free shared memory slice again + self.uninit_shared_memory.remove(&id); + } + } + DaemonNodeEvent::SendOutMessage { id } => { + let (output_id, metadata, memory) = self + .uninit_shared_memory + .remove(&id) + .ok_or_else(|| eyre!("invalid shared memory id"))?; + + let 
memory = Rc::new(memory); + + let dataflow = self + .running + .get_mut(&dataflow_id) + .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; + + // figure out receivers from dataflow graph + let empty_set = BTreeSet::new(); + let local_receivers = dataflow + .mappings + .get(&(node_id, output_id)) + .unwrap_or(&empty_set); + + // send shared memory ID to all local receivers + let mut closed = Vec::new(); + for (receiver_id, input_id) in local_receivers { + if let Some(channel) = dataflow.subscribe_channels.get(receiver_id) { + let drop_token = DropToken::generate(); + let send_result = channel.send_async(daemon_messages::NodeEvent::Input { + id: input_id.clone(), + metadata: metadata.clone(), + data: Some(daemon_messages::InputData { + shared_memory_id: id.clone(), + drop_token: drop_token.clone(), + }), + }); + + match timeout(Duration::from_millis(10), send_result).await { + Ok(Ok(())) => { + // keep shared memory ptr in order to free it once all subscribers are done + self.sent_out_shared_memory + .insert(drop_token, memory.clone()); + } + Ok(Err(_)) => { + closed.push(receiver_id); + } + Err(_) => { + tracing::warn!( + "dropping input event `{receiver_id}/{input_id}` (send timeout)" + ); + } + } + } + } + for id in closed { + dataflow.subscribe_channels.remove(id); + } + + // TODO send `data` via network to all remove receivers + let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); + + let _ = reply_sender.send(ControlReply::Result(Ok(()))); + } + DaemonNodeEvent::Stopped => { + tracing::info!("Stopped: {dataflow_id}/{node_id}"); + + let _ = reply_sender.send(ControlReply::Result(Ok(()))); + + // notify downstream nodes + let dataflow = self + .running + .get_mut(&dataflow_id) + .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; + let downstream_nodes: BTreeSet<_> = dataflow + .mappings + .iter() + .filter(|((source_id, _), _)| source_id == &node_id) + .flat_map(|(_, v)| v) + .collect(); + for 
(receiver_id, input_id) in downstream_nodes { + let Some(channel) = dataflow.subscribe_channels.get(receiver_id) else { + continue; + }; + + let _ = channel + .send_async(daemon_messages::NodeEvent::InputClosed { + id: input_id.clone(), + }) + .await; + } + + // TODO: notify remote nodes + + dataflow.subscribe_channels.remove(&node_id); + if dataflow.subscribe_channels.is_empty() { + tracing::info!( + "Dataflow `{dataflow_id}` finished on machine `{}`", + self.machine_id + ); + if coordinator::send_event( + self.coordinator_addr, + self.machine_id.clone(), + DaemonEvent::AllNodesFinished { + dataflow_id, + result: Ok(()), + }, + ) + .await + .is_err() + { + tracing::warn!("failed to report dataflow finish to coordinator"); + } + self.running.remove(&dataflow_id); + } + } + } + Ok(()) + } + + async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result<()> { + match event { + DoraEvent::Timer { + dataflow_id, + interval, + metadata, + } => { + let Some(dataflow) = self.running.get_mut(&dataflow_id) else { + tracing::warn!("Timer event for unknown dataflow `{dataflow_id}`"); + return Ok(()) + }; + + let Some(subscribers) = dataflow.timers.get(&interval) else { + return Ok(()); + }; + + let mut closed = Vec::new(); + for (receiver_id, input_id) in subscribers { + let Some(channel) = dataflow.subscribe_channels.get(receiver_id) else { + continue; + }; + + let send_result = channel.send_async(daemon_messages::NodeEvent::Input { + id: input_id.clone(), + metadata: metadata.clone(), + data: None, + }); + match timeout(Duration::from_millis(1), send_result).await { + Ok(Ok(())) => {} + Ok(Err(_)) => { + closed.push(receiver_id); + } + Err(_) => { + tracing::info!( + "dropping timer tick event for `{receiver_id}` (send timeout)" + ); + } + } + } + for id in closed { + dataflow.subscribe_channels.remove(id); + } + } + DoraEvent::SpawnedNodeResult { + dataflow_id, + node_id, + result, + } => { + if self + .running + .get(&dataflow_id) + .and_then(|d| 
d.subscribe_channels.get(&node_id)) + .is_some() + { + tracing::warn!( + "node `{dataflow_id}/{node_id}` finished without sending `Stopped` message" + ); + } + match result { + Ok(()) => { + tracing::info!("node {dataflow_id}/{node_id} finished"); + } + Err(err) => { + tracing::error!( + "{:?}", + err.wrap_err(format!("error in node `{dataflow_id}/{node_id}`")) + ); + } + } + } + } + Ok(()) + } +} + +#[derive(Default)] +pub struct RunningDataflow { + subscribe_channels: HashMap>, + mappings: HashMap>, + timers: BTreeMap>, + /// Keep handles to all timer tasks of this dataflow to cancel them on drop. + _timer_handles: Vec>, +} + +type OutputId = (NodeId, DataId); +type InputId = (NodeId, DataId); + +#[derive(Debug)] +pub enum Event { + NewConnection(TcpStream), + ConnectError(eyre::Report), + Node { + dataflow_id: DataflowId, + node_id: NodeId, + event: DaemonNodeEvent, + reply_sender: oneshot::Sender, + }, + Coordinator(CoordinatorEvent), + Dora(DoraEvent), + Drop(DropEvent), +} + +#[derive(Debug)] +pub enum DaemonNodeEvent { + PrepareOutputMessage { + output_id: DataId, + metadata: dora_message::Metadata<'static>, + data_len: usize, + }, + SendOutMessage { + id: MessageId, + }, + Stopped, + Subscribe { + event_sender: flume::Sender, + }, +} + +#[derive(Debug)] +pub enum DoraEvent { + Timer { + dataflow_id: DataflowId, + interval: Duration, + metadata: dora_message::Metadata<'static>, + }, + SpawnedNodeResult { + dataflow_id: DataflowId, + node_id: NodeId, + result: eyre::Result<()>, + }, +} + +type MessageId = String; diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 52f21ec4..fb8a3c78 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,38 +1,7 @@ -use coordinator::CoordinatorEvent; -use dora_core::{ - config::{DataId, InputMapping, NodeId}, - coordinator_messages::DaemonEvent, - daemon_messages::{ - self, ControlReply, DaemonCoordinatorEvent, DaemonCoordinatorReply, DataflowId, DropEvent, - DropToken, 
SpawnDataflowNodes, - }, - topics::DORA_COORDINATOR_PORT_DEFAULT, -}; -use dora_message::uhlc::HLC; -use eyre::{bail, eyre, Context, ContextCompat}; -use futures::FutureExt; -use futures_concurrency::stream::Merge; -use shared_memory::{Shmem, ShmemConf}; -use std::{ - collections::{BTreeMap, BTreeSet, HashMap}, - net::{Ipv4Addr, SocketAddr}, - rc::Rc, - time::Duration, -}; -use tokio::{ - net::TcpStream, - sync::{mpsc, oneshot}, - time::timeout, -}; -use tokio_stream::{ - wrappers::{ReceiverStream, TcpListenerStream}, - Stream, StreamExt, -}; - -mod coordinator; -mod listener; -mod spawn; -mod tcp_utils; +use dora_core::topics::DORA_COORDINATOR_PORT_DEFAULT; +use dora_daemon::Daemon; +use eyre::Context; +use std::net::Ipv4Addr; #[tokio::main] async fn main() -> eyre::Result<()> { @@ -53,483 +22,6 @@ async fn run() -> eyre::Result<()> { Daemon::run(coordinator_socket.into(), machine_id).await } -struct Daemon { - port: u16, - uninit_shared_memory: HashMap, Shmem)>, - sent_out_shared_memory: HashMap>, - - running: HashMap, - - dora_events_tx: mpsc::Sender, - - coordinator_addr: SocketAddr, - machine_id: String, -} - -impl Daemon { - pub async fn run(coordinator_addr: SocketAddr, machine_id: String) -> eyre::Result<()> { - // connect to the coordinator - let coordinator_events = coordinator::register(coordinator_addr, machine_id.clone()) - .await - .wrap_err("failed to connect to dora-coordinator")? - .map(Event::Coordinator); - - // create listener for node connection - let listener = listener::create_listener().await?; - let port = listener - .local_addr() - .wrap_err("failed to get local addr of listener")? 
- .port(); - let new_connections = TcpListenerStream::new(listener).map(|c| { - c.map(Event::NewConnection) - .wrap_err("failed to open connection") - .unwrap_or_else(Event::ConnectError) - }); - tracing::info!("Listening for node connections on 127.0.0.1:{port}"); - - let (dora_events_tx, dora_events_rx) = mpsc::channel(5); - let daemon = Self { - port, - uninit_shared_memory: Default::default(), - sent_out_shared_memory: Default::default(), - running: HashMap::new(), - dora_events_tx, - coordinator_addr, - machine_id, - }; - let dora_events = ReceiverStream::new(dora_events_rx).map(Event::Dora); - let events = (coordinator_events, new_connections, dora_events).merge(); - daemon.run_inner(events).await - } - - async fn run_inner( - mut self, - incoming_events: impl Stream + Unpin, - ) -> eyre::Result<()> { - let (node_events_tx, node_events_rx) = mpsc::channel(10); - let node_events = ReceiverStream::new(node_events_rx); - - let mut events = (incoming_events, node_events).merge(); - - while let Some(event) = events.next().await { - match event { - Event::NewConnection(connection) => { - let events_tx = node_events_tx.clone(); - tokio::spawn(listener::handle_connection(connection, events_tx)); - } - Event::ConnectError(err) => { - tracing::warn!("{:?}", err.wrap_err("failed to connect")); - } - Event::Coordinator(CoordinatorEvent { event, reply_tx }) => { - let result = self.handle_coordinator_event(event).await; - let _ = reply_tx.send(DaemonCoordinatorReply::SpawnResult( - result.map_err(|err| format!("{err:?}")), - )); - } - Event::Node { - dataflow_id: dataflow, - node_id, - event, - reply_sender, - } => { - self.handle_node_event(event, dataflow, node_id, reply_sender) - .await? 
- } - Event::Dora(event) => self.handle_dora_event(event).await?, - Event::Drop(DropEvent { token }) => { - match self.sent_out_shared_memory.remove(&token) { - Some(rc) => { - if let Ok(_shmem) = Rc::try_unwrap(rc) { - tracing::trace!( - "freeing shared memory after receiving last drop token" - ) - } - } - None => tracing::warn!("received unknown drop token {token:?}"), - } - } - } - } - - Ok(()) - } - - async fn handle_coordinator_event( - &mut self, - event: DaemonCoordinatorEvent, - ) -> eyre::Result<()> { - match event { - DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { dataflow_id, nodes }) => { - let dataflow = match self.running.entry(dataflow_id) { - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(Default::default()) - } - std::collections::hash_map::Entry::Occupied(_) => { - bail!("there is already a running dataflow with ID `{dataflow_id}`") - } - }; - - for (node_id, params) in nodes { - for (input_id, mapping) in params.node.run_config.inputs.clone() { - match mapping { - InputMapping::User(mapping) => { - if mapping.operator.is_some() { - bail!("operators are not supported"); - } - dataflow - .mappings - .entry((mapping.source, mapping.output)) - .or_default() - .insert((node_id.clone(), input_id)); - } - InputMapping::Timer { interval } => { - dataflow - .timers - .entry(interval) - .or_default() - .insert((node_id.clone(), input_id)); - } - } - } - - spawn::spawn_node(dataflow_id, params, self.port, self.dora_events_tx.clone()) - .await - .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; - } - - // spawn timer tasks - for interval in dataflow.timers.keys().copied() { - let events_tx = self.dora_events_tx.clone(); - let task = async move { - let mut interval_stream = tokio::time::interval(interval); - let hlc = HLC::default(); - loop { - interval_stream.tick().await; - - let event = DoraEvent::Timer { - dataflow_id, - interval, - metadata: dora_message::Metadata::from_parameters( - hlc.new_timestamp(), - 
Default::default(), - ), - }; - if events_tx.send(event).await.is_err() { - break; - } - } - }; - let (task, handle) = task.remote_handle(); - tokio::spawn(task); - dataflow._timer_handles.push(handle); - } - - Ok(()) - } - DaemonCoordinatorEvent::StopDataflow { dataflow_id } => { - let dataflow = self - .running - .get_mut(&dataflow_id) - .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; - - for channel in dataflow.subscribe_channels.values_mut() { - let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; - } - - Ok(()) - } - } - } - - async fn handle_node_event( - &mut self, - event: DaemonNodeEvent, - dataflow_id: DataflowId, - node_id: NodeId, - reply_sender: oneshot::Sender, - ) -> Result<(), eyre::ErrReport> { - match event { - DaemonNodeEvent::Subscribe { event_sender } => { - let result = match self.running.get_mut(&dataflow_id) { - Some(dataflow) => { - dataflow.subscribe_channels.insert(node_id, event_sender); - Ok(()) - } - None => Err(format!("no running dataflow with ID `{dataflow_id}`")), - }; - let _ = reply_sender.send(ControlReply::Result(result)); - } - DaemonNodeEvent::PrepareOutputMessage { - output_id, - metadata, - data_len, - } => { - let memory = ShmemConf::new() - .size(data_len) - .create() - .wrap_err("failed to allocate shared memory")?; - let id = memory.get_os_id().to_owned(); - self.uninit_shared_memory - .insert(id.clone(), (output_id, metadata, memory)); - - let reply = ControlReply::PreparedMessage { - shared_memory_id: id.clone(), - }; - if reply_sender.send(reply).is_err() { - // free shared memory slice again - self.uninit_shared_memory.remove(&id); - } - } - DaemonNodeEvent::SendOutMessage { id } => { - let (output_id, metadata, memory) = self - .uninit_shared_memory - .remove(&id) - .ok_or_else(|| eyre!("invalid shared memory id"))?; - - let memory = Rc::new(memory); - - let dataflow = self - .running - .get_mut(&dataflow_id) - .wrap_err_with(|| format!("no running dataflow with ID 
`{dataflow_id}`"))?; - - // figure out receivers from dataflow graph - let empty_set = BTreeSet::new(); - let local_receivers = dataflow - .mappings - .get(&(node_id, output_id)) - .unwrap_or(&empty_set); - - // send shared memory ID to all local receivers - let mut closed = Vec::new(); - for (receiver_id, input_id) in local_receivers { - if let Some(channel) = dataflow.subscribe_channels.get(receiver_id) { - let drop_token = DropToken::generate(); - let send_result = channel.send_async(daemon_messages::NodeEvent::Input { - id: input_id.clone(), - metadata: metadata.clone(), - data: Some(daemon_messages::InputData { - shared_memory_id: id.clone(), - drop_token: drop_token.clone(), - }), - }); - - match timeout(Duration::from_millis(10), send_result).await { - Ok(Ok(())) => { - // keep shared memory ptr in order to free it once all subscribers are done - self.sent_out_shared_memory - .insert(drop_token, memory.clone()); - } - Ok(Err(_)) => { - closed.push(receiver_id); - } - Err(_) => { - tracing::warn!( - "dropping input event `{receiver_id}/{input_id}` (send timeout)" - ); - } - } - } - } - for id in closed { - dataflow.subscribe_channels.remove(id); - } - - // TODO send `data` via network to all remove receivers - let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); - - let _ = reply_sender.send(ControlReply::Result(Ok(()))); - } - DaemonNodeEvent::Stopped => { - tracing::info!("Stopped: {dataflow_id}/{node_id}"); - - let _ = reply_sender.send(ControlReply::Result(Ok(()))); - - // notify downstream nodes - let dataflow = self - .running - .get_mut(&dataflow_id) - .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; - let downstream_nodes: BTreeSet<_> = dataflow - .mappings - .iter() - .filter(|((source_id, _), _)| source_id == &node_id) - .flat_map(|(_, v)| v) - .collect(); - for (receiver_id, input_id) in downstream_nodes { - let Some(channel) = dataflow.subscribe_channels.get(receiver_id) else { - continue; - }; - - 
let _ = channel - .send_async(daemon_messages::NodeEvent::InputClosed { - id: input_id.clone(), - }) - .await; - } - - // TODO: notify remote nodes - - dataflow.subscribe_channels.remove(&node_id); - if dataflow.subscribe_channels.is_empty() { - tracing::info!( - "Dataflow `{dataflow_id}` finished on machine `{}`", - self.machine_id - ); - if coordinator::send_event( - self.coordinator_addr, - self.machine_id.clone(), - DaemonEvent::AllNodesFinished { - dataflow_id, - result: Ok(()), - }, - ) - .await - .is_err() - { - tracing::warn!("failed to report dataflow finish to coordinator"); - } - self.running.remove(&dataflow_id); - } - } - } - Ok(()) - } - - async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result<()> { - match event { - DoraEvent::Timer { - dataflow_id, - interval, - metadata, - } => { - let Some(dataflow) = self.running.get_mut(&dataflow_id) else { - tracing::warn!("Timer event for unknown dataflow `{dataflow_id}`"); - return Ok(()) - }; - - let Some(subscribers) = dataflow.timers.get(&interval) else { - return Ok(()); - }; - - let mut closed = Vec::new(); - for (receiver_id, input_id) in subscribers { - let Some(channel) = dataflow.subscribe_channels.get(receiver_id) else { - continue; - }; - - let send_result = channel.send_async(daemon_messages::NodeEvent::Input { - id: input_id.clone(), - metadata: metadata.clone(), - data: None, - }); - match timeout(Duration::from_millis(1), send_result).await { - Ok(Ok(())) => {} - Ok(Err(_)) => { - closed.push(receiver_id); - } - Err(_) => { - tracing::info!( - "dropping timer tick event for `{receiver_id}` (send timeout)" - ); - } - } - } - for id in closed { - dataflow.subscribe_channels.remove(id); - } - } - DoraEvent::SpawnedNodeResult { - dataflow_id, - node_id, - result, - } => { - if self - .running - .get(&dataflow_id) - .and_then(|d| d.subscribe_channels.get(&node_id)) - .is_some() - { - tracing::warn!( - "node `{dataflow_id}/{node_id}` finished without sending `Stopped` message" - ); - 
} - match result { - Ok(()) => { - tracing::info!("node {dataflow_id}/{node_id} finished"); - } - Err(err) => { - tracing::error!( - "{:?}", - err.wrap_err(format!("error in node `{dataflow_id}/{node_id}`")) - ); - } - } - } - } - Ok(()) - } -} - -#[derive(Default)] -pub struct RunningDataflow { - subscribe_channels: HashMap>, - mappings: HashMap>, - timers: BTreeMap>, - /// Keep handles to all timer tasks of this dataflow to cancel them on drop. - _timer_handles: Vec>, -} - -type OutputId = (NodeId, DataId); -type InputId = (NodeId, DataId); - -#[derive(Debug)] -pub enum Event { - NewConnection(TcpStream), - ConnectError(eyre::Report), - Node { - dataflow_id: DataflowId, - node_id: NodeId, - event: DaemonNodeEvent, - reply_sender: oneshot::Sender, - }, - Coordinator(CoordinatorEvent), - Dora(DoraEvent), - Drop(DropEvent), -} - -#[derive(Debug)] -pub enum DaemonNodeEvent { - PrepareOutputMessage { - output_id: DataId, - metadata: dora_message::Metadata<'static>, - data_len: usize, - }, - SendOutMessage { - id: MessageId, - }, - Stopped, - Subscribe { - event_sender: flume::Sender, - }, -} - -#[derive(Debug)] -pub enum DoraEvent { - Timer { - dataflow_id: DataflowId, - interval: Duration, - metadata: dora_message::Metadata<'static>, - }, - SpawnedNodeResult { - dataflow_id: DataflowId, - node_id: NodeId, - result: eyre::Result<()>, - }, -} - -type MessageId = String; - fn set_up_tracing() -> eyre::Result<()> { use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; From 824da6a00c56e33a13b7a2fe2017b50917adc64b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 12:49:40 +0100 Subject: [PATCH 040/225] Make the coordinator connection optional --- binaries/daemon/src/lib.rs | 45 +++++++++++++++++++++---------------- binaries/daemon/src/main.rs | 2 +- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 1b9bd544..697fdba9 100644 --- a/binaries/daemon/src/lib.rs +++ 
b/binaries/daemon/src/lib.rs @@ -9,7 +9,7 @@ use dora_core::{ }; use dora_message::uhlc::HLC; use eyre::{bail, eyre, Context, ContextCompat}; -use futures::FutureExt; +use futures::{future::Either, stream, FutureExt}; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; use std::{ @@ -42,17 +42,22 @@ pub struct Daemon { dora_events_tx: mpsc::Sender, - coordinator_addr: SocketAddr, + coordinator_addr: Option, machine_id: String, } impl Daemon { - pub async fn run(coordinator_addr: SocketAddr, machine_id: String) -> eyre::Result<()> { + pub async fn run(coordinator_addr: Option, machine_id: String) -> eyre::Result<()> { // connect to the coordinator - let coordinator_events = coordinator::register(coordinator_addr, machine_id.clone()) - .await - .wrap_err("failed to connect to dora-coordinator")? - .map(Event::Coordinator); + let coordinator_events = match coordinator_addr { + Some(addr) => Either::Left( + coordinator::register(addr, machine_id.clone()) + .await + .wrap_err("failed to connect to dora-coordinator")? 
+ .map(Event::Coordinator), + ), + None => Either::Right(stream::empty()), + }; // create listener for node connection let listener = listener::create_listener().await?; @@ -358,18 +363,20 @@ impl Daemon { "Dataflow `{dataflow_id}` finished on machine `{}`", self.machine_id ); - if coordinator::send_event( - self.coordinator_addr, - self.machine_id.clone(), - DaemonEvent::AllNodesFinished { - dataflow_id, - result: Ok(()), - }, - ) - .await - .is_err() - { - tracing::warn!("failed to report dataflow finish to coordinator"); + if let Some(addr) = self.coordinator_addr { + if coordinator::send_event( + addr, + self.machine_id.clone(), + DaemonEvent::AllNodesFinished { + dataflow_id, + result: Ok(()), + }, + ) + .await + .is_err() + { + tracing::warn!("failed to report dataflow finish to coordinator"); + } } self.running.remove(&dataflow_id); } diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index fb8a3c78..e8700f11 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -19,7 +19,7 @@ async fn run() -> eyre::Result<()> { let machine_id = String::new(); // TODO - Daemon::run(coordinator_socket.into(), machine_id).await + Daemon::run(Some(coordinator_socket.into()), machine_id).await } fn set_up_tracing() -> eyre::Result<()> { From 550062917bb9887c638654191030cc30187818cc Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 13:42:03 +0100 Subject: [PATCH 041/225] Create test/examples entry point for daemon and use it to run rust-dataflow example --- Cargo.lock | 12 ++-- Cargo.toml | 5 +- binaries/daemon/src/lib.rs | 74 +++++++++++++++++----- binaries/daemon/src/main.rs | 2 +- examples/rust-dataflow/dataflow.yml | 10 --- examples/rust-dataflow/operator/Cargo.toml | 13 ---- examples/rust-dataflow/operator/src/lib.rs | 47 -------------- examples/rust-dataflow/run.rs | 59 ++++++++++++----- 8 files changed, 111 insertions(+), 111 deletions(-) delete mode 100644 examples/rust-dataflow/operator/Cargo.toml delete 
mode 100644 examples/rust-dataflow/operator/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index da86d159..d6285549 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1003,10 +1003,13 @@ dependencies = [ name = "dora-examples" version = "0.0.0" dependencies = [ - "dora-coordinator", + "dora-core", + "dora-daemon", "dunce", "eyre", + "serde_yaml 0.8.23", "tokio", + "uuid 1.2.1", ] [[package]] @@ -3320,13 +3323,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "rust-dataflow-example-operator" -version = "0.1.1" -dependencies = [ - "dora-operator-api", -] - [[package]] name = "rust-dataflow-example-sink" version = "0.1.1" diff --git a/Cargo.toml b/Cargo.toml index d98953ff..38873e2a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,8 +27,11 @@ license = "Apache-2.0" [dev-dependencies] eyre = "0.6.8" tokio = "1.20.1" -dora-coordinator = { path = "binaries/coordinator" } +dora-daemon = { path = "binaries/daemon" } +dora-core = { path = "libraries/core" } dunce = "1.0.2" +serde_yaml = "0.8.23" +uuid = { version = "1.2.1", features = ["v4", "serde"] } [[example]] name = "c-dataflow" diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 697fdba9..b663dbd6 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -9,7 +9,7 @@ use dora_core::{ }; use dora_message::uhlc::HLC; use eyre::{bail, eyre, Context, ContextCompat}; -use futures::{future::Either, stream, FutureExt}; +use futures::{stream, FutureExt}; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; use std::{ @@ -27,6 +27,7 @@ use tokio_stream::{ wrappers::{ReceiverStream, TcpListenerStream}, Stream, StreamExt, }; +use uuid::Uuid; mod coordinator; mod listener; @@ -44,21 +45,45 @@ pub struct Daemon { coordinator_addr: Option, machine_id: String, + + /// used for testing and examples + exit_when_done: Option>, } impl Daemon { - pub async fn run(coordinator_addr: Option, machine_id: String) -> eyre::Result<()> { + pub async fn run(coordinator_addr: 
SocketAddr, machine_id: String) -> eyre::Result<()> { // connect to the coordinator - let coordinator_events = match coordinator_addr { - Some(addr) => Either::Left( - coordinator::register(addr, machine_id.clone()) - .await - .wrap_err("failed to connect to dora-coordinator")? - .map(Event::Coordinator), - ), - None => Either::Right(stream::empty()), - }; + let coordinator_events = coordinator::register(coordinator_addr, machine_id.clone()) + .await + .wrap_err("failed to connect to dora-coordinator")? + .map(Event::Coordinator); + Self::run_general(coordinator_events, Some(coordinator_addr), machine_id, None).await + } + pub async fn run_dataflow(dataflow: SpawnDataflowNodes) -> eyre::Result<()> { + let exit_when_done = [dataflow.dataflow_id].into(); + let coordinator_events = stream::once(async { DaemonCoordinatorEvent::Spawn(dataflow) }) + .map(|event| { + Event::Coordinator(CoordinatorEvent { + event, + reply_tx: oneshot::channel().0, + }) + }); + Self::run_general( + Box::pin(coordinator_events), + None, + "".into(), + Some(exit_when_done), + ) + .await + } + + async fn run_general( + external_events: impl Stream + Unpin, + coordinator_addr: Option, + machine_id: String, + exit_when_done: Option>, + ) -> eyre::Result<()> { // create listener for node connection let listener = listener::create_listener().await?; let port = listener @@ -81,9 +106,10 @@ impl Daemon { dora_events_tx, coordinator_addr, machine_id, + exit_when_done, }; let dora_events = ReceiverStream::new(dora_events_rx).map(Event::Dora); - let events = (coordinator_events, new_connections, dora_events).merge(); + let events = (external_events, new_connections, dora_events).merge(); daemon.run_inner(events).await } @@ -117,8 +143,13 @@ impl Daemon { event, reply_sender, } => { - self.handle_node_event(event, dataflow, node_id, reply_sender) + match self + .handle_node_event(event, dataflow, node_id, reply_sender) .await? 
+ { + RunStatus::Continue => {} + RunStatus::Exit => break, + } } Event::Dora(event) => self.handle_dora_event(event).await?, Event::Drop(DropEvent { token }) => { @@ -234,7 +265,7 @@ impl Daemon { dataflow_id: DataflowId, node_id: NodeId, reply_sender: oneshot::Sender, - ) -> Result<(), eyre::ErrReport> { + ) -> eyre::Result { match event { DaemonNodeEvent::Subscribe { event_sender } => { let result = match self.running.get_mut(&dataflow_id) { @@ -379,10 +410,17 @@ impl Daemon { } } self.running.remove(&dataflow_id); + + if let Some(exit_when_done) = &mut self.exit_when_done { + exit_when_done.remove(&dataflow_id); + if exit_when_done.is_empty() { + return Ok(RunStatus::Exit); + } + } } } } - Ok(()) + Ok(RunStatus::Continue) } async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result<()> { @@ -518,3 +556,9 @@ pub enum DoraEvent { } type MessageId = String; + +#[must_use] +enum RunStatus { + Continue, + Exit, +} diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index e8700f11..fb8a3c78 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -19,7 +19,7 @@ async fn run() -> eyre::Result<()> { let machine_id = String::new(); // TODO - Daemon::run(Some(coordinator_socket.into()), machine_id).await + Daemon::run(coordinator_socket.into(), machine_id).await } fn set_up_tracing() -> eyre::Result<()> { diff --git a/examples/rust-dataflow/dataflow.yml b/examples/rust-dataflow/dataflow.yml index 4bf75ae9..110dc327 100644 --- a/examples/rust-dataflow/dataflow.yml +++ b/examples/rust-dataflow/dataflow.yml @@ -11,16 +11,6 @@ nodes: tick: dora/timer/millis/10 outputs: - random - # - id: runtime-node - # operators: - # - id: rust-operator - # build: cargo build -p rust-dataflow-example-operator - # shared-library: ../../target/debug/rust_dataflow_example_operator - # inputs: - # tick: dora/timer/millis/100 - # random: rust-node/random - # outputs: - # - status - id: rust-sink custom: build: cargo build -p 
rust-dataflow-example-sink diff --git a/examples/rust-dataflow/operator/Cargo.toml b/examples/rust-dataflow/operator/Cargo.toml deleted file mode 100644 index e18c676c..00000000 --- a/examples/rust-dataflow/operator/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "rust-dataflow-example-operator" -version = "0.1.1" -edition = "2021" -license = "Apache-2.0" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[lib] -crate-type = ["cdylib"] - -[dependencies] -dora-operator-api = { path = "../../../apis/rust/operator" } diff --git a/examples/rust-dataflow/operator/src/lib.rs b/examples/rust-dataflow/operator/src/lib.rs deleted file mode 100644 index 3c0713a1..00000000 --- a/examples/rust-dataflow/operator/src/lib.rs +++ /dev/null @@ -1,47 +0,0 @@ -#![warn(unsafe_op_in_unsafe_fn)] - -use dora_operator_api::{register_operator, DoraOperator, DoraOutputSender, DoraStatus}; -use std::time::{Duration, Instant}; - -register_operator!(ExampleOperator); - -#[derive(Debug, Default)] -struct ExampleOperator { - ticks: usize, - last_random_at: Option, -} - -impl DoraOperator for ExampleOperator { - fn on_input( - &mut self, - id: &str, - data: &[u8], - output_sender: &mut DoraOutputSender, - ) -> Result { - match id { - "tick" => { - self.ticks += 1; - } - "random" => { - let parsed = { - let data: [u8; 8] = data.try_into().map_err(|_| "unexpected random data")?; - u64::from_le_bytes(data) - }; - let output = format!( - "operator received random value {parsed} after {} ticks", - self.ticks - ); - output_sender.send("status".into(), output.into_bytes())?; - self.last_random_at = Some(Instant::now()); - } - other => eprintln!("ignoring unexpected input {other}"), - } - if let Some(last_random_at) = self.last_random_at { - if last_random_at.elapsed() > Duration::from_secs(1) { - // looks like the node sending the random values finished -> exit too - return Ok(DoraStatus::Stop); - } - } - Ok(DoraStatus::Continue) - } -} 
diff --git a/examples/rust-dataflow/run.rs b/examples/rust-dataflow/run.rs index 9378905c..54fd131f 100644 --- a/examples/rust-dataflow/run.rs +++ b/examples/rust-dataflow/run.rs @@ -1,5 +1,11 @@ +use dora_core::{ + daemon_messages::{SpawnDataflowNodes, SpawnNodeParams}, + descriptor::{CoreNodeKind, Descriptor}, +}; use eyre::{bail, Context}; -use std::path::Path; +use std::{collections::BTreeMap, path::Path}; +use tokio::fs; +use uuid::Uuid; #[tokio::main] async fn main() -> eyre::Result<()> { @@ -9,13 +15,38 @@ async fn main() -> eyre::Result<()> { let dataflow = Path::new("dataflow.yml"); build_dataflow(dataflow).await?; - build_package("dora-runtime").await?; - dora_coordinator::run(dora_coordinator::Args { - run_dataflow: dataflow.to_owned().into(), - runtime: Some(root.join("target").join("debug").join("dora-runtime")), - }) - .await?; + let working_dir = dataflow + .canonicalize() + .context("failed to canoncialize dataflow path")? + .parent() + .ok_or_else(|| eyre::eyre!("canonicalized dataflow path has no parent"))? 
+ .to_owned(); + + let nodes = read_descriptor(dataflow).await?.resolve_aliases(); + let mut custom_nodes = BTreeMap::new(); + for node in nodes { + match node.kind { + CoreNodeKind::Runtime(_) => todo!(), + CoreNodeKind::Custom(n) => { + custom_nodes.insert( + node.id.clone(), + SpawnNodeParams { + node_id: node.id, + node: n, + working_dir: working_dir.clone(), + }, + ); + } + } + } + + let spawn_command = SpawnDataflowNodes { + dataflow_id: Uuid::new_v4(), + nodes: custom_nodes, + }; + + dora_daemon::Daemon::run_dataflow(spawn_command).await?; Ok(()) } @@ -32,13 +63,9 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { Ok(()) } -async fn build_package(package: &str) -> eyre::Result<()> { - let cargo = std::env::var("CARGO").unwrap(); - let mut cmd = tokio::process::Command::new(&cargo); - cmd.arg("build"); - cmd.arg("--package").arg(package); - if !cmd.status().await?.success() { - bail!("failed to build {package}"); - }; - Ok(()) +pub async fn read_descriptor(file: &Path) -> eyre::Result { + let descriptor_file = fs::read(file).await.context("failed to open given file")?; + let descriptor: Descriptor = + serde_yaml::from_slice(&descriptor_file).context("failed to parse given descriptor")?; + Ok(descriptor) } From 6b7f7f740634f29e031ac0e6a06191b198e282a8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 13:43:23 +0100 Subject: [PATCH 042/225] Remove `run_dataflow` argument from dora-coordinator This functionality is now provided by the daemon. 
--- binaries/coordinator/src/lib.rs | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 07a3bec7..bfbef430 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -34,15 +34,10 @@ mod tcp_utils; pub struct Args { #[clap(long)] pub runtime: Option, - #[clap(long)] - pub run_dataflow: Option, } pub async fn run(args: Args) -> eyre::Result<()> { - let Args { - runtime, - run_dataflow, - } = args; + let Args { runtime } = args; let runtime_path = runtime.unwrap_or_else(|| { std::env::args() @@ -52,16 +47,8 @@ pub async fn run(args: Args) -> eyre::Result<()> { .with_file_name("dora-runtime") }); - match run_dataflow { - Some(path) => { - // start the given dataflow directly - todo!(); - } - None => { - // start in daemon mode - start(&runtime_path).await?; - } - } + // start in daemon mode + start(&runtime_path).await?; Ok(()) } From 74b4c71ecdae3613669586d4b0371005a97fb3ed Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 13:43:39 +0100 Subject: [PATCH 043/225] Remove unneeded calls from coordinator --- binaries/coordinator/src/run/mod.rs | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index dddb01c4..894ca2f3 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -5,7 +5,7 @@ use dora_core::{ daemon_messages::{ DaemonCoordinatorEvent, DaemonCoordinatorReply, SpawnDataflowNodes, SpawnNodeParams, }, - descriptor::{collect_dora_timers, CoreNodeKind, Descriptor}, + descriptor::{CoreNodeKind, Descriptor}, }; use eyre::{bail, eyre, ContextCompat, WrapErr}; use futures::{stream::FuturesUnordered, StreamExt}; @@ -38,7 +38,6 @@ pub async fn spawn_dataflow( .ok_or_else(|| eyre!("canonicalized dataflow path has no parent"))? 
.to_owned(); let nodes = descriptor.resolve_aliases(); - let dora_timers = collect_dora_timers(&nodes); let uuid = Uuid::new_v4(); let communication_config = { let mut config = descriptor.communication; @@ -125,17 +124,6 @@ pub struct SpawnedDataflow { pub machines: BTreeSet, } -pub async fn await_tasks( - mut tasks: FuturesUnordered>>, -) -> eyre::Result<()> { - while let Some(task_result) = tasks.next().await { - task_result - .wrap_err("failed to join async task")? - .wrap_err("custom node failed")?; - } - Ok(()) -} - async fn read_descriptor(file: &Path) -> Result { let descriptor_file = tokio::fs::read(file) .await From 611103b2115fa359a9d6331852bb13b62c02e773 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 13:47:20 +0100 Subject: [PATCH 044/225] Add more logging to `rust-dataflow` example --- Cargo.lock | 2 ++ Cargo.toml | 2 ++ binaries/daemon/src/lib.rs | 3 +++ examples/rust-dataflow/run.rs | 11 +++++++++++ 4 files changed, 18 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index d6285549..c11a74de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1009,6 +1009,8 @@ dependencies = [ "eyre", "serde_yaml 0.8.23", "tokio", + "tracing", + "tracing-subscriber", "uuid 1.2.1", ] diff --git a/Cargo.toml b/Cargo.toml index 38873e2a..e91c3d8c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,8 @@ dora-core = { path = "libraries/core" } dunce = "1.0.2" serde_yaml = "0.8.23" uuid = { version = "1.2.1", features = ["v4", "serde"] } +tracing = "0.1.36" +tracing-subscriber = "0.3.15" [[example]] name = "c-dataflow" diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index b663dbd6..7a88b58f 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -414,6 +414,9 @@ impl Daemon { if let Some(exit_when_done) = &mut self.exit_when_done { exit_when_done.remove(&dataflow_id); if exit_when_done.is_empty() { + tracing::info!( + "exiting daemon because all required dataflows are finished" + ); return Ok(RunStatus::Exit); } 
} diff --git a/examples/rust-dataflow/run.rs b/examples/rust-dataflow/run.rs index 54fd131f..1b9bb095 100644 --- a/examples/rust-dataflow/run.rs +++ b/examples/rust-dataflow/run.rs @@ -9,6 +9,8 @@ use uuid::Uuid; #[tokio::main] async fn main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing subscriber")?; + let root = Path::new(env!("CARGO_MANIFEST_DIR")); std::env::set_current_dir(root.join(file!()).parent().unwrap()) .wrap_err("failed to set working dir")?; @@ -69,3 +71,12 @@ pub async fn read_descriptor(file: &Path) -> eyre::Result { serde_yaml::from_slice(&descriptor_file).context("failed to parse given descriptor")?; Ok(descriptor) } + +fn set_up_tracing() -> eyre::Result<()> { + use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer().pretty(); + let subscriber = tracing_subscriber::Registry::default().with(stdout_log); + tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} From 5db784ff27647340206ec0961a8681317b188a78 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 16:28:05 +0100 Subject: [PATCH 045/225] Move descriptor reading to dora daemon + add cli arg to run dataflow --- Cargo.lock | 1 + binaries/daemon/Cargo.toml | 1 + binaries/daemon/src/lib.rs | 79 ++++++++++++++++++++++++++++------- binaries/daemon/src/main.rs | 30 ++++++++++--- examples/rust-dataflow/run.rs | 47 +-------------------- 5 files changed, 93 insertions(+), 65 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c11a74de..2458b1c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -970,6 +970,7 @@ dependencies = [ name = "dora-daemon" version = "0.1.0" dependencies = [ + "clap 3.2.20", "dora-core", "dora-download", "dora-message", diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 2b33f259..e6f75685 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -22,3 +22,4 @@ 
dora-download = { path = "../../libraries/extensions/download" } serde_yaml = "0.8.23" uuid = { version = "1.1.2", features = ["v4"] } futures = "0.3.25" +clap = { version = "3.1.8", features = ["derive"] } diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 7a88b58f..a03f7fcc 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -4,21 +4,24 @@ use dora_core::{ coordinator_messages::DaemonEvent, daemon_messages::{ self, ControlReply, DaemonCoordinatorEvent, DaemonCoordinatorReply, DataflowId, DropEvent, - DropToken, SpawnDataflowNodes, + DropToken, SpawnDataflowNodes, SpawnNodeParams, }, + descriptor::{CoreNodeKind, Descriptor}, }; use dora_message::uhlc::HLC; use eyre::{bail, eyre, Context, ContextCompat}; -use futures::{stream, FutureExt}; +use futures::{future, stream, FutureExt, TryFutureExt}; use futures_concurrency::stream::Merge; use shared_memory::{Shmem, ShmemConf}; use std::{ collections::{BTreeMap, BTreeSet, HashMap}, net::SocketAddr, + path::Path, rc::Rc, time::Duration, }; use tokio::{ + fs, net::TcpStream, sync::{mpsc, oneshot}, time::timeout, @@ -60,22 +63,63 @@ impl Daemon { Self::run_general(coordinator_events, Some(coordinator_addr), machine_id, None).await } - pub async fn run_dataflow(dataflow: SpawnDataflowNodes) -> eyre::Result<()> { - let exit_when_done = [dataflow.dataflow_id].into(); - let coordinator_events = stream::once(async { DaemonCoordinatorEvent::Spawn(dataflow) }) - .map(|event| { - Event::Coordinator(CoordinatorEvent { - event, - reply_tx: oneshot::channel().0, - }) - }); - Self::run_general( + pub async fn run_dataflow(dataflow_path: &Path) -> eyre::Result<()> { + let working_dir = dataflow_path + .canonicalize() + .context("failed to canoncialize dataflow path")? + .parent() + .ok_or_else(|| eyre::eyre!("canonicalized dataflow path has no parent"))? 
+ .to_owned(); + + let nodes = read_descriptor(dataflow_path).await?.resolve_aliases(); + let mut custom_nodes = BTreeMap::new(); + for node in nodes { + match node.kind { + CoreNodeKind::Runtime(_) => todo!(), + CoreNodeKind::Custom(n) => { + custom_nodes.insert( + node.id.clone(), + SpawnNodeParams { + node_id: node.id, + node: n, + working_dir: working_dir.clone(), + }, + ); + } + } + } + + let spawn_command = SpawnDataflowNodes { + dataflow_id: Uuid::new_v4(), + nodes: custom_nodes, + }; + + let exit_when_done = [spawn_command.dataflow_id].into(); + let (reply_tx, reply_rx) = oneshot::channel(); + let coordinator_events = stream::once(async move { + Event::Coordinator(CoordinatorEvent { + event: DaemonCoordinatorEvent::Spawn(spawn_command), + reply_tx, + }) + }); + let run_result = Self::run_general( Box::pin(coordinator_events), None, "".into(), Some(exit_when_done), - ) - .await + ); + + let spawn_result = reply_rx + .map_err(|err| eyre!("failed to receive spawn result: {err}")) + .and_then(|r| async { + match r { + DaemonCoordinatorReply::SpawnResult(result) => result.map_err(|err| eyre!(err)), + DaemonCoordinatorReply::StopResult(_) => Err(eyre!("unexpected spawn reply")), + } + }); + + future::try_join(run_result, spawn_result).await?; + Ok(()) } async fn run_general( @@ -565,3 +609,10 @@ enum RunStatus { Continue, Exit, } + +pub async fn read_descriptor(file: &Path) -> eyre::Result { + let descriptor_file = fs::read(file).await.wrap_err("failed to open given file")?; + let descriptor: Descriptor = + serde_yaml::from_slice(&descriptor_file).context("failed to parse given descriptor")?; + Ok(descriptor) +} diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index fb8a3c78..8befc2f8 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -1,7 +1,14 @@ use dora_core::topics::DORA_COORDINATOR_PORT_DEFAULT; use dora_daemon::Daemon; use eyre::Context; -use std::net::Ipv4Addr; +use std::{net::Ipv4Addr, path::PathBuf}; + 
+#[derive(Debug, Clone, clap::Parser)] +#[clap(about = "Dora daemon")] +pub struct Args { + #[clap(long)] + pub run_dataflow: Option, +} #[tokio::main] async fn main() -> eyre::Result<()> { @@ -13,13 +20,24 @@ async fn main() -> eyre::Result<()> { async fn run() -> eyre::Result<()> { set_up_tracing().wrap_err("failed to set up tracing subscriber")?; - tracing::info!("Starting in local mode"); - let localhost = Ipv4Addr::new(127, 0, 0, 1); - let coordinator_socket = (localhost, DORA_COORDINATOR_PORT_DEFAULT); + let Args { run_dataflow } = clap::Parser::parse(); + + match run_dataflow { + Some(dataflow_path) => { + tracing::info!("Starting dataflow `{}`", dataflow_path.display()); + + Daemon::run_dataflow(&dataflow_path).await + } + None => { + tracing::info!("Starting in local mode"); + let localhost = Ipv4Addr::new(127, 0, 0, 1); + let coordinator_socket = (localhost, DORA_COORDINATOR_PORT_DEFAULT); - let machine_id = String::new(); // TODO + let machine_id = String::new(); // TODO - Daemon::run(coordinator_socket.into(), machine_id).await + Daemon::run(coordinator_socket.into(), machine_id).await + } + } } fn set_up_tracing() -> eyre::Result<()> { diff --git a/examples/rust-dataflow/run.rs b/examples/rust-dataflow/run.rs index 1b9bb095..cddc0ddc 100644 --- a/examples/rust-dataflow/run.rs +++ b/examples/rust-dataflow/run.rs @@ -1,11 +1,5 @@ -use dora_core::{ - daemon_messages::{SpawnDataflowNodes, SpawnNodeParams}, - descriptor::{CoreNodeKind, Descriptor}, -}; use eyre::{bail, Context}; -use std::{collections::BTreeMap, path::Path}; -use tokio::fs; -use uuid::Uuid; +use std::path::Path; #[tokio::main] async fn main() -> eyre::Result<()> { @@ -18,37 +12,7 @@ async fn main() -> eyre::Result<()> { let dataflow = Path::new("dataflow.yml"); build_dataflow(dataflow).await?; - let working_dir = dataflow - .canonicalize() - .context("failed to canoncialize dataflow path")? - .parent() - .ok_or_else(|| eyre::eyre!("canonicalized dataflow path has no parent"))? 
- .to_owned(); - - let nodes = read_descriptor(dataflow).await?.resolve_aliases(); - let mut custom_nodes = BTreeMap::new(); - for node in nodes { - match node.kind { - CoreNodeKind::Runtime(_) => todo!(), - CoreNodeKind::Custom(n) => { - custom_nodes.insert( - node.id.clone(), - SpawnNodeParams { - node_id: node.id, - node: n, - working_dir: working_dir.clone(), - }, - ); - } - } - } - - let spawn_command = SpawnDataflowNodes { - dataflow_id: Uuid::new_v4(), - nodes: custom_nodes, - }; - - dora_daemon::Daemon::run_dataflow(spawn_command).await?; + dora_daemon::Daemon::run_dataflow(dataflow).await?; Ok(()) } @@ -65,13 +29,6 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { Ok(()) } -pub async fn read_descriptor(file: &Path) -> eyre::Result { - let descriptor_file = fs::read(file).await.context("failed to open given file")?; - let descriptor: Descriptor = - serde_yaml::from_slice(&descriptor_file).context("failed to parse given descriptor")?; - Ok(descriptor) -} - fn set_up_tracing() -> eyre::Result<()> { use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; From 2e13ef54860f2e50423071c59a2d6e8525233857 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 16:29:15 +0100 Subject: [PATCH 046/225] Set `stdin` to `null` when spawning nodes --- binaries/daemon/src/spawn.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 3fd98429..bc94c466 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -5,7 +5,7 @@ use dora_core::{ }; use dora_download::download_file; use eyre::{eyre, WrapErr}; -use std::{env::consts::EXE_EXTENSION, path::Path}; +use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; use tokio::sync::mpsc; #[tracing::instrument] @@ -21,6 +21,8 @@ pub async fn spawn_node( working_dir, } = params; + tracing::trace!("Spawning node `{dataflow_id}/{node_id}`"); + let resolved_path = if 
source_is_url(&node.source) { // try to download the shared library let target_path = Path::new("build") @@ -58,6 +60,7 @@ pub async fn spawn_node( command.env(key, value.to_string()); } } + command.stdin(Stdio::null()); let mut child = command.spawn().wrap_err_with(move || { format!( From cf89bf0d1cd8069d4fdd484a73c862ca6ceff172 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 16:29:29 +0100 Subject: [PATCH 047/225] Fix error messages --- apis/rust/node/src/daemon.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 68bc0377..43eb709c 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -68,9 +68,9 @@ impl ControlChannel { pub fn report_stop(&mut self) -> eyre::Result<()> { tcp_send(&mut self.0, &ControlRequest::Stopped) - .wrap_err("failed to send subscribe request to dora-daemon")?; + .wrap_err("failed to report stopped to dora-daemon")?; match tcp_receive(&mut self.0) - .wrap_err("failed to receive subscribe reply from dora-daemon")? + .wrap_err("failed to receive stopped reply from dora-daemon")? 
{ dora_core::daemon_messages::ControlReply::Result(result) => result .map_err(|e| eyre!(e)) From 9664f0fd77d194a472865a3fcd80244872435882 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 16:29:51 +0100 Subject: [PATCH 048/225] Fix Python API --- apis/python/node/src/lib.rs | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index 61c102aa..3b3df768 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -4,7 +4,6 @@ use std::ops::Deref; use dora_node_api::{ daemon::{Event, EventStream}, - dora_core::config::NodeId, DoraNode, }; use dora_operator_api_python::{metadata_to_pydict, pydict_to_metadata}; @@ -16,7 +15,6 @@ use pyo3::{ #[pyclass] pub struct Node { - id: NodeId, events: EventStream, node: DoraNode, } @@ -45,15 +43,9 @@ impl IntoPy for PyInput<'_> { impl Node { #[new] pub fn new() -> Result { - let id = { - let raw = - std::env::var("DORA_NODE_ID").wrap_err("env variable DORA_NODE_ID must be set")?; - serde_yaml::from_str(&raw).context("failed to deserialize operator config")? 
- }; - let (node, events) = DoraNode::init_from_env()?; - Ok(Node { id, events, node }) + Ok(Node { events, node }) } #[allow(clippy::should_implement_trait)] @@ -85,7 +77,7 @@ impl Node { } pub fn id(&self) -> String { - self.id.to_string() + self.node.id().to_string() } } @@ -97,9 +89,9 @@ impl Node { // Ok(()) // } -// #[pymodule] -// fn dora(_py: Python, m: &PyModule) -> PyResult<()> { -// m.add_function(wrap_pyfunction!(start_runtime, m)?)?; -// m.add_class::().unwrap(); -// Ok(()) -// } +#[pymodule] +fn dora(_py: Python, m: &PyModule) -> PyResult<()> { + // m.add_function(wrap_pyfunction!(start_runtime, m)?)?; + m.add_class::().unwrap(); + Ok(()) +} From 1516ee6559c291e41827fd1f559f06a62aa13ed6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 17:18:00 +0100 Subject: [PATCH 049/225] Don't allocate shared memory when `data_len=0` when preparing output --- apis/rust/node/src/lib.rs | 4 +++- binaries/daemon/src/lib.rs | 45 ++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index f12e6a70..3919fcda 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -75,7 +75,7 @@ impl DoraNode { .wrap_err("failed to prepare sample for output message")?; // map shared memory and fill in data - { + if data_len > 0 { let mut shared_memory = ShmemConf::new() .os_id(&sample.id) .open() @@ -83,6 +83,8 @@ impl DoraNode { let raw = unsafe { shared_memory.as_slice_mut() }; data(raw); + } else { + data(&mut []); } self.control_channel diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index a03f7fcc..6619060b 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -39,7 +39,7 @@ mod tcp_utils; pub struct Daemon { port: u16, - uninit_shared_memory: HashMap, Shmem)>, + prepared_messages: HashMap, Option)>, sent_out_shared_memory: HashMap>, running: HashMap, @@ -144,7 +144,7 @@ impl Daemon { let 
(dora_events_tx, dora_events_rx) = mpsc::channel(5); let daemon = Self { port, - uninit_shared_memory: Default::default(), + prepared_messages: Default::default(), sent_out_shared_memory: Default::default(), running: HashMap::new(), dora_events_tx, @@ -326,12 +326,21 @@ impl Daemon { metadata, data_len, } => { - let memory = ShmemConf::new() - .size(data_len) - .create() - .wrap_err("failed to allocate shared memory")?; - let id = memory.get_os_id().to_owned(); - self.uninit_shared_memory + let memory = if data_len > 0 { + Some( + ShmemConf::new() + .size(data_len) + .create() + .wrap_err("failed to allocate shared memory")?, + ) + } else { + None + }; + let id = memory + .as_ref() + .map(|m| m.get_os_id().to_owned()) + .unwrap_or_else(|| Uuid::new_v4().to_string()); + self.prepared_messages .insert(id.clone(), (output_id, metadata, memory)); let reply = ControlReply::PreparedMessage { @@ -339,16 +348,16 @@ impl Daemon { }; if reply_sender.send(reply).is_err() { // free shared memory slice again - self.uninit_shared_memory.remove(&id); + self.prepared_messages.remove(&id); } } DaemonNodeEvent::SendOutMessage { id } => { let (output_id, metadata, memory) = self - .uninit_shared_memory + .prepared_messages .remove(&id) .ok_or_else(|| eyre!("invalid shared memory id"))?; - let memory = Rc::new(memory); + let memory = memory.map(Rc::new); let dataflow = self .running @@ -370,8 +379,8 @@ impl Daemon { let send_result = channel.send_async(daemon_messages::NodeEvent::Input { id: input_id.clone(), metadata: metadata.clone(), - data: Some(daemon_messages::InputData { - shared_memory_id: id.clone(), + data: memory.as_ref().map(|m| daemon_messages::InputData { + shared_memory_id: m.get_os_id().to_owned(), drop_token: drop_token.clone(), }), }); @@ -379,8 +388,10 @@ impl Daemon { match timeout(Duration::from_millis(10), send_result).await { Ok(Ok(())) => { // keep shared memory ptr in order to free it once all subscribers are done - self.sent_out_shared_memory - 
.insert(drop_token, memory.clone()); + if let Some(memory) = &memory { + self.sent_out_shared_memory + .insert(drop_token, memory.clone()); + } } Ok(Err(_)) => { closed.push(receiver_id); @@ -398,7 +409,9 @@ impl Daemon { } // TODO send `data` via network to all remove receivers - let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); + if let Some(memory) = &memory { + let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); + } let _ = reply_sender.send(ControlReply::Result(Ok(()))); } From ccc17ddf65c65f5f669f94b6fb24f1f9b42a6c59 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 17:19:03 +0100 Subject: [PATCH 050/225] Python API: Pass events as dict --- apis/python/node/src/lib.rs | 54 ++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index 3b3df768..617cc220 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -1,7 +1,5 @@ #![allow(clippy::borrow_deref_ref)] // clippy warns about code generated by #[pymethods] -use std::ops::Deref; - use dora_node_api::{ daemon::{Event, EventStream}, DoraNode, @@ -23,19 +21,45 @@ pub struct PyInput<'a>(Event<'a>); impl IntoPy for PyInput<'_> { fn into_py(self, py: Python) -> PyObject { - match self.0 { - Event::Stop => ("stop").into_py(py), - Event::Input { id, metadata, data } => ( - "input", - id.to_string(), - PyBytes::new(py, data.as_deref().unwrap_or_default()), - metadata_to_pydict(&metadata, py), - ) - .into_py(py), - Event::InputClosed { id } => ("input-closed", id.deref()).into_py(py), - Event::Error(err) => ("error", err).into_py(py), - other => ("unknown", format!("{other:?}")).into_py(py), - } + let dict = PyDict::new(py); + + let ty = match self.0 { + Event::Stop => "stop", + Event::Input { id, metadata, data } => { + dict.set_item("id", id.to_string()) + .wrap_err("failed to add input ID") + .unwrap(); + dict.set_item( + 
"data", + PyBytes::new(py, data.as_deref().unwrap_or_default()), + ) + .wrap_err("failed to add input data") + .unwrap(); + dict.set_item("metadata", metadata_to_pydict(&metadata, py)) + .wrap_err("failed to add input metadata") + .unwrap(); + "input" + } + Event::InputClosed { id } => { + dict.set_item("id", id.to_string()) + .wrap_err("failed to add clsoed-input ID") + .unwrap(); + "input-closed" + } + Event::Error(err) => { + dict.set_item("error", err) + .wrap_err("failed to add error") + .unwrap(); + "error" + } + _other => "unknown", + }; + + dict.set_item("type", ty) + .wrap_err("could not make type a python dictionary item") + .unwrap(); + + dict.into() } } From c5db53a4c6300d5d61464720839da25f0c4102ce Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 17:20:18 +0100 Subject: [PATCH 051/225] Update Python dataflow example for new daemon API --- examples/python-dataflow/dataflow.yml | 8 +-- .../dataflow_without_webcam.yml | 8 +-- examples/python-dataflow/no_webcam.py | 17 +++-- examples/python-dataflow/object_detection.py | 62 +++++++++---------- examples/python-dataflow/plot.py | 38 +++++++++--- examples/python-dataflow/run.rs | 13 +++- examples/python-dataflow/run.sh | 2 +- examples/python-dataflow/webcam.py | 16 +++-- 8 files changed, 105 insertions(+), 59 deletions(-) mode change 100644 => 100755 examples/python-dataflow/object_detection.py mode change 100644 => 100755 examples/python-dataflow/plot.py diff --git a/examples/python-dataflow/dataflow.yml b/examples/python-dataflow/dataflow.yml index 1aa1d0ac..b7e574dc 100644 --- a/examples/python-dataflow/dataflow.yml +++ b/examples/python-dataflow/dataflow.yml @@ -12,16 +12,16 @@ nodes: - image - id: object_detection - operator: - python: object_detection.py + custom: + source: ./object_detection.py inputs: image: webcam/image outputs: - bbox - id: plot - operator: - python: plot.py + custom: + source: ./plot.py inputs: image: webcam/image bbox: object_detection/bbox diff --git 
a/examples/python-dataflow/dataflow_without_webcam.yml b/examples/python-dataflow/dataflow_without_webcam.yml index 6b9a00af..1a5fed05 100644 --- a/examples/python-dataflow/dataflow_without_webcam.yml +++ b/examples/python-dataflow/dataflow_without_webcam.yml @@ -12,16 +12,16 @@ nodes: - image - id: object_detection - operator: - python: object_detection.py + custom: + source: ./object_detection.py inputs: image: no_webcam/image outputs: - bbox - id: plot - operator: - python: plot.py + custom: + source: ./plot.py inputs: image: no_webcam/image bbox: object_detection/bbox diff --git a/examples/python-dataflow/no_webcam.py b/examples/python-dataflow/no_webcam.py index 77ebb249..68de6f9a 100755 --- a/examples/python-dataflow/no_webcam.py +++ b/examples/python-dataflow/no_webcam.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- import time @@ -17,7 +17,16 @@ start = time.time() while time.time() - start < 20: # Wait next dora_input - node.next() - node.send_output("image", arr.tobytes()) + event = node.next() + match event["type"]: + case "input": + print("received input", event["id"]) + node.send_output("image", arr.tobytes()) + case "stop": + print("received stop") + break + case other: + print("received unexpected event:", other) + break -time.sleep(1) + time.sleep(1) diff --git a/examples/python-dataflow/object_detection.py b/examples/python-dataflow/object_detection.py old mode 100644 new mode 100755 index 05c66b1b..ef33e946 --- a/examples/python-dataflow/object_detection.py +++ b/examples/python-dataflow/object_detection.py @@ -1,40 +1,36 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + from enum import Enum from typing import Callable +from dora import Node import cv2 import numpy as np import torch - -class DoraStatus(Enum): - CONTINUE = 0 - STOP = 1 - - -class Operator: - """ - Infering object from images - """ - - def __init__(self): - self.model = torch.hub.load("ultralytics/yolov5", "yolov5n") - - def on_input( 
- self, - dora_input: dict, - send_output: Callable[[str, bytes], None], - ) -> DoraStatus: - """Handle image - - Args: - dora_input (dict): Dict containing the "id", "data", and "metadata" - send_output (Callable[[str, bytes]]): Function enabling sending output back to dora. - """ - - frame = np.frombuffer(dora_input["data"], dtype="uint8") - frame = cv2.imdecode(frame, -1) - frame = frame[:, :, ::-1] # OpenCV image (BGR to RGB) - results = self.model(frame) # includes NMS - arrays = np.array(results.xyxy[0].cpu()).tobytes() - send_output("bbox", arrays, dora_input["metadata"]) - return DoraStatus.CONTINUE +model = torch.hub.load("ultralytics/yolov5", "yolov5n") + +node = Node() + +for event in node: + match event["type"]: + case "input": + match event["id"]: + case "image": + print("received image input") + frame = np.frombuffer(event["data"], dtype="uint8") + frame = cv2.imdecode(frame, -1) + frame = frame[:, :, ::-1] # OpenCV image (BGR to RGB) + results = model(frame) # includes NMS + arrays = np.array(results.xyxy[0].cpu()).tobytes() + + node.send_output("bbox", arrays, event["metadata"]) + case other: + print("ignoring unexpected input:", other) + case "stop": + print("received stop") + break + case other: + print("received unexpected event:", other) + break diff --git a/examples/python-dataflow/plot.py b/examples/python-dataflow/plot.py old mode 100644 new mode 100755 index 57a2a293..e3107702 --- a/examples/python-dataflow/plot.py +++ b/examples/python-dataflow/plot.py @@ -1,6 +1,10 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + import os from enum import Enum from typing import Callable +from dora import Node import cv2 import numpy as np @@ -11,13 +15,12 @@ CI = os.environ.get("CI") font = cv2.FONT_HERSHEY_SIMPLEX - -class DoraStatus(Enum): +class Status(Enum): CONTINUE = 0 STOP = 1 -class Operator: +class Plotter: """ Plot image and bounding box """ @@ -29,15 +32,13 @@ class Operator: def on_input( self, dora_input: dict, - send_output: 
Callable[[str, bytes], None], - ) -> DoraStatus: + ) -> Status: """ Put image and bounding box on cv2 window. Args: dora_input["id"] (str): Id of the dora_input declared in the yaml configuration dora_input["data"] (bytes): Bytes message of the dora_input - send_output (Callable[[str, bytes]]): Function enabling sending output back to dora. """ if dora_input["id"] == "image": frame = np.frombuffer(dora_input["data"], dtype="uint8") @@ -78,9 +79,30 @@ class Operator: if CI != "true": cv2.imshow("frame", self.image) if cv2.waitKey(1) & 0xFF == ord("q"): - return DoraStatus.STOP + return Status.STOP - return DoraStatus.CONTINUE + return Status.CONTINUE def __del__(self): cv2.destroyAllWindows() + + +plotter = Plotter() +node = Node() + +for event in node: + match event["type"]: + case "input": + status = plotter.on_input(event) + match status: + case Status.CONTINUE: + pass + case Status.STOP: + print("plotter returned stop status") + break + case "stop": + print("received stop") + break + case other: + print("received unexpected event:", other) + break diff --git a/examples/python-dataflow/run.rs b/examples/python-dataflow/run.rs index 9c21b6ab..ac32ff00 100644 --- a/examples/python-dataflow/run.rs +++ b/examples/python-dataflow/run.rs @@ -3,11 +3,13 @@ use std::{env, path::Path}; #[tokio::main] async fn main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing subscriber")?; + let root = Path::new(env!("CARGO_MANIFEST_DIR")); std::env::set_current_dir(root.join(file!()).parent().unwrap()) .wrap_err("failed to set working dir")?; - build_package("dora-runtime").await?; + build_package("dora-daemon").await?; run(root).await?; @@ -33,3 +35,12 @@ async fn run(_root: &Path) -> eyre::Result<()> { }; Ok(()) } + +fn set_up_tracing() -> eyre::Result<()> { + use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer().pretty(); + let subscriber = 
tracing_subscriber::Registry::default().with(stdout_log); + tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} diff --git a/examples/python-dataflow/run.sh b/examples/python-dataflow/run.sh index 51ee97a3..c4036109 100644 --- a/examples/python-dataflow/run.sh +++ b/examples/python-dataflow/run.sh @@ -10,4 +10,4 @@ cd ../../../examples/python-dataflow pip install --upgrade pip pip install -r requirements.txt -cargo run -p dora-coordinator -- --run-dataflow dataflow_without_webcam.yml +cargo run -p dora-daemon -- --run-dataflow dataflow_without_webcam.yml diff --git a/examples/python-dataflow/webcam.py b/examples/python-dataflow/webcam.py index a44d776a..435fb5ec 100755 --- a/examples/python-dataflow/webcam.py +++ b/examples/python-dataflow/webcam.py @@ -15,9 +15,17 @@ start = time.time() # Run for 20 seconds while time.time() - start < 10: # Wait next dora_input - node.next() - ret, frame = video_capture.read() - if ret: - node.send_output("image", cv2.imencode(".jpg", frame)[1].tobytes()) + event = node.next() + match event["type"]: + case "input": + ret, frame = video_capture.read() + if ret: + node.send_output("image", cv2.imencode(".jpg", frame)[1].tobytes()) + case "stop": + print("received stop") + break + case other: + print("received unexpected event:", other) + break video_capture.release() From 15a5b8931de0aa1dd6de220c7c7dba6703a2be60 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 17:29:42 +0100 Subject: [PATCH 052/225] Use Python 3.10 for now (for match statements) --- .github/workflows/ci-python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-python.yml b/.github/workflows/ci-python.yml index d04d8e24..0026f32b 100644 --- a/.github/workflows/ci-python.yml +++ b/.github/workflows/ci-python.yml @@ -20,7 +20,7 @@ jobs: sudo apt-get install -y libacl1-dev - uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 
3.10 - uses: r7kamura/rust-problem-matchers@v1.1.0 - run: cargo --version --verbose From 8775aa407c3784522953c4930e46950690b642b6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 17:31:12 +0100 Subject: [PATCH 053/225] Allow manually triggering Python CI runs --- .github/workflows/ci-python.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci-python.yml b/.github/workflows/ci-python.yml index 0026f32b..0b6f7941 100644 --- a/.github/workflows/ci-python.yml +++ b/.github/workflows/ci-python.yml @@ -7,6 +7,7 @@ on: - apis/python/** - binaries/runtime/** pull_request: + workflow_dispatch: jobs: examples: From 6f9ae2523a18cdf2a0b9d44bcd8f06c9626e7ca5 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 27 Dec 2022 18:05:05 +0100 Subject: [PATCH 054/225] Fix python version --- .github/workflows/ci-python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-python.yml b/.github/workflows/ci-python.yml index 0b6f7941..2a76bd60 100644 --- a/.github/workflows/ci-python.yml +++ b/.github/workflows/ci-python.yml @@ -21,7 +21,7 @@ jobs: sudo apt-get install -y libacl1-dev - uses: actions/setup-python@v2 with: - python-version: 3.10 + python-version: "3.10" - uses: r7kamura/rust-problem-matchers@v1.1.0 - run: cargo --version --verbose From 453c40d20b3c66f06026d97a75313dca1f0af5f8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 28 Dec 2022 15:56:55 +0100 Subject: [PATCH 055/225] Update C node API for new daemon API --- apis/c/node/node_api.h | 19 ++++-- apis/c/node/src/lib.rs | 146 ++++++++++++++++++++++++++++------------- 2 files changed, 114 insertions(+), 51 deletions(-) diff --git a/apis/c/node/node_api.h b/apis/c/node/node_api.h index 6de486ce..4ad43888 100644 --- a/apis/c/node/node_api.h +++ b/apis/c/node/node_api.h @@ -3,9 +3,20 @@ void *init_dora_context_from_env(); void free_dora_context(void *dora_context); -void *dora_next_input(void *dora_context); -void 
read_dora_input_id(void *dora_input, char **out_ptr, size_t *out_len); -void read_dora_input_data(void *dora_input, char **out_ptr, size_t *out_len); -void free_dora_input(void *dora_input); +void *dora_next_event(void *dora_context); +void free_dora_event(void *dora_event); + +enum EventType +{ + Stop, + Input, + InputClosed, + Error, + Unknown, +}; +enum EventType read_dora_event_type(void *dora_event); + +void read_dora_input_id(void *dora_event, char **out_ptr, size_t *out_len); +void read_dora_input_data(void *dora_event, char **out_ptr, size_t *out_len); int dora_send_output(void *dora_context, char *id_ptr, size_t id_len, char *data_ptr, size_t data_len); diff --git a/apis/c/node/src/lib.rs b/apis/c/node/src/lib.rs index 0760adb9..5f13ab84 100644 --- a/apis/c/node/src/lib.rs +++ b/apis/c/node/src/lib.rs @@ -1,12 +1,15 @@ #![deny(unsafe_op_in_unsafe_fn)] -use dora_node_api::{DoraNode, Input}; +use dora_node_api::{ + daemon::{Event, EventStream}, + DoraNode, +}; use eyre::Context; use std::{ffi::c_void, ptr, slice}; struct DoraContext { node: &'static mut DoraNode, - inputs: flume::Receiver, + events: EventStream, } /// Initializes a dora context from the environment variables that were set by @@ -21,10 +24,9 @@ struct DoraContext { #[no_mangle] pub extern "C" fn init_dora_context_from_env() -> *mut c_void { let context = || { - let node = DoraNode::init_from_env()?; + let (node, events) = DoraNode::init_from_env()?; let node = Box::leak(Box::new(node)); - let inputs = node.inputs()?; - Result::<_, eyre::Report>::Ok(DoraContext { node, inputs }) + Result::<_, eyre::Report>::Ok(DoraContext { node, events }) }; let context = match context().context("failed to initialize node") { Ok(n) => n, @@ -54,15 +56,15 @@ pub unsafe extern "C" fn free_dora_context(context: *mut c_void) { let _ = unsafe { Box::from_raw(node as *const DoraNode as *mut DoraNode) }; } -/// Waits for the next incoming input for the node. +/// Waits for the next incoming event for the node. 
/// -/// Returns a pointer to the input on success. This pointer must not be used -/// directly. Instead, use the `read_dora_input_*` functions to read out the -/// ID and data of the input. When the input is not needed anymore, use -/// [`free_dora_input`] to free it again. +/// Returns a pointer to the event on success. This pointer must not be used +/// directly. Instead, use the `read_dora_event_*` functions to read out the +/// type and payload of the event. When the event is not needed anymore, use +/// [`free_dora_event`] to free it again. /// -/// Returns a null pointer when all input streams were closed. This means that -/// no more input will be available. Nodes typically react by stopping. +/// Returns a null pointer when all event streams were closed. This means that +/// no more event will be available. Nodes typically react by stopping. /// /// ## Safety /// @@ -70,83 +72,133 @@ pub unsafe extern "C" fn free_dora_context(context: *mut c_void) { /// [`init_dora_context_from_env`]. The context must be still valid, i.e., not /// freed yet. #[no_mangle] -pub unsafe extern "C" fn dora_next_input(context: *mut c_void) -> *mut c_void { +pub unsafe extern "C" fn dora_next_event(context: *mut c_void) -> *mut c_void { let context: &mut DoraContext = unsafe { &mut *context.cast() }; - match context.inputs.recv() { - Ok(input) => Box::into_raw(Box::new(input)).cast(), - Err(flume::RecvError::Disconnected) => ptr::null_mut(), + match context.events.recv() { + Some(event) => Box::into_raw(Box::new(event)).cast(), + None => ptr::null_mut(), } } -/// Reads out the ID of the given input. +/// Reads out the type of the given event. +/// +/// ## Safety +/// +/// The `event` argument must be a dora event received through +/// [`dora_next_event`]. The event must be still valid, i.e., not +/// freed yet. 
+#[no_mangle] +pub unsafe extern "C" fn read_dora_event_type(event: *const ()) -> EventType { + let event: &Event = unsafe { &*event.cast() }; + match event { + Event::Stop => EventType::Stop, + Event::Input { .. } => EventType::Input, + Event::InputClosed { .. } => EventType::InputClosed, + Event::Error(_) => EventType::Error, + _ => EventType::Unknown, + } +} + +#[repr(C)] +pub enum EventType { + Stop, + Input, + InputClosed, + Error, + Unknown, +} + +/// Reads out the ID of the given input event. /// /// Writes the `out_ptr` and `out_len` with the start pointer and length of the /// ID string of the input. The ID is guaranteed to be valid UTF-8. /// +/// Writes a null pointer and length `0` if the given event is not an input event. +/// /// ## Safety /// -/// The `input` argument must be a dora input received through -/// [`dora_next_input`]. The input must be still valid, i.e., not +/// The `event` argument must be a dora event received through +/// [`dora_next_event`]. The event must be still valid, i.e., not /// freed yet. The returned `out_ptr` must not be used after -/// freeing the `input`, since it points directly into the input's +/// freeing the `event`, since it points directly into the event's /// memory. #[no_mangle] pub unsafe extern "C" fn read_dora_input_id( - input: *const (), + event: *const (), out_ptr: *mut *const u8, out_len: *mut usize, ) { - let input: &Input = unsafe { &*input.cast() }; - let id = input.id.as_str().as_bytes(); - let ptr = id.as_ptr(); - let len = id.len(); - unsafe { - *out_ptr = ptr; - *out_len = len; + let event: &Event = unsafe { &*event.cast() }; + match event { + Event::Input { id, .. } => { + let id = id.as_str().as_bytes(); + let ptr = id.as_ptr(); + let len = id.len(); + unsafe { + *out_ptr = ptr; + *out_len = len; + } + } + _ => unsafe { + *out_ptr = ptr::null(); + *out_len = 0; + }, } } -/// Reads out the data of the given input. +/// Reads out the data of the given input event. 
/// /// Writes the `out_ptr` and `out_len` with the start pointer and length of the /// input's data array. The data array is a raw byte array, whose format /// depends on the source operator/node. /// +/// Writes a null pointer and length `0` if the given event is not an input event +/// or when an input event has no associated data. +/// /// ## Safety /// -/// The `input` argument must be a dora input received through -/// [`dora_next_input`]. The input must be still valid, i.e., not +/// The `event` argument must be a dora event received through +/// [`dora_next_event`]. The event must be still valid, i.e., not /// freed yet. The returned `out_ptr` must not be used after -/// freeing the `input`, since it points directly into the input's +/// freeing the `event`, since it points directly into the event's /// memory. #[no_mangle] pub unsafe extern "C" fn read_dora_input_data( - input: *const (), + event: *const (), out_ptr: *mut *const u8, out_len: *mut usize, ) { - let input: &Input = unsafe { &*input.cast() }; - let data = &input.data(); - let ptr = data.as_ptr(); - let len = data.len(); - unsafe { - *out_ptr = ptr; - *out_len = len; + let event: &Event = unsafe { &*event.cast() }; + match event { + Event::Input { + data: Some(data), .. + } => { + let ptr = data.as_ptr(); + let len = data.len(); + unsafe { + *out_ptr = ptr; + *out_len = len; + } + } + _ => unsafe { + *out_ptr = ptr::null(); + *out_len = 0; + }, } } -/// Frees the given dora input. +/// Frees the given dora event. /// /// ## Safety /// -/// Only pointers created through [`dora_next_input`] are allowed +/// Only pointers created through [`dora_next_event`] are allowed /// as arguments. Each context pointer must be freed exactly once. After /// freeing, the pointer and all derived pointers must not be used anymore. -/// This also applies to the `read_dora_input_*` functions, which return -/// pointers into the original input structure. 
+/// This also applies to the `read_dora_event_*` functions, which return +/// pointers into the original event structure. #[no_mangle] -pub unsafe extern "C" fn free_dora_input(input: *mut c_void) { - let _: Box = unsafe { Box::from_raw(input.cast()) }; +pub unsafe extern "C" fn free_dora_event(event: *mut c_void) { + let _: Box = unsafe { Box::from_raw(event.cast()) }; } /// Sends the given output to subscribed dora nodes/operators. @@ -194,7 +246,7 @@ unsafe fn try_send_output( let data = unsafe { slice::from_raw_parts(data_ptr, data_len) }; context .node - .send_output(&output_id, Default::default(), data.len(), |out| { + .send_output(output_id, Default::default(), data.len(), |out| { out.copy_from_slice(data); }) } From 3c2bc57473a341ca82470872b87d68856d9f3c3d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 28 Dec 2022 15:57:27 +0100 Subject: [PATCH 056/225] Update C dataflow example for new API --- examples/c-dataflow/dataflow.yml | 14 ++----- examples/c-dataflow/node.c | 39 ++++++++++++------ examples/c-dataflow/operator.c | 70 -------------------------------- examples/c-dataflow/run.rs | 32 +-------------- examples/c-dataflow/sink.c | 49 +++++++++++++++------- 5 files changed, 66 insertions(+), 138 deletions(-) delete mode 100644 examples/c-dataflow/operator.c diff --git a/examples/c-dataflow/dataflow.yml b/examples/c-dataflow/dataflow.yml index b2f30e40..472d7094 100644 --- a/examples/c-dataflow/dataflow.yml +++ b/examples/c-dataflow/dataflow.yml @@ -7,19 +7,11 @@ nodes: custom: source: build/c_node inputs: - timer: dora/timer/secs/1 + timer: dora/timer/millis/50 outputs: - - tick - - id: runtime-node - operators: - - id: c_operator - shared-library: build/operator - inputs: - tick: c_node/tick - outputs: - - counter + - counter - id: c_sink custom: source: build/c_sink inputs: - counter: runtime-node/c_operator/counter + counter: c_node/counter diff --git a/examples/c-dataflow/node.c b/examples/c-dataflow/node.c index eb797ca0..c1f2e307 100644 
--- a/examples/c-dataflow/node.c +++ b/examples/c-dataflow/node.c @@ -23,29 +23,44 @@ int main() printf("[c node] dora context initialized\n"); - for (char i = 0; i < 10; i++) + for (char i = 0; i < 100; i++) { printf("[c node] waiting for next input\n"); - void *input = dora_next_input(dora_context); - if (input == NULL) + void *event = dora_next_event(dora_context); + if (event == NULL) { - printf("[c node] ERROR: unexpected end of input\n"); + printf("[c node] ERROR: unexpected end of event\n"); return -1; } - char *data; - size_t data_len; - read_dora_input_data(input, &data, &data_len); + enum EventType ty = read_dora_event_type(event); - assert(data_len == 0); + if (ty == Input) + { + char *data; + size_t data_len; + read_dora_input_data(event, &data, &data_len); + + assert(data_len == 0); - char out_id[] = "tick"; - dora_send_output(dora_context, out_id, strlen(out_id), &i, 1); + char out_id[] = "counter"; + dora_send_output(dora_context, out_id, strlen(out_id), &i, 1); + } + else if (ty == Stop) + { + printf("[c node] received stop event\n"); + free_dora_event(event); + break; + } + else + { + printf("[c node] received unexpected event: %d\n", ty); + } - free_dora_input(input); + free_dora_event(event); } - printf("[c node] received 10 inputs\n"); + printf("[c node] received 10 events\n"); free_dora_context(dora_context); diff --git a/examples/c-dataflow/operator.c b/examples/c-dataflow/operator.c deleted file mode 100644 index 76ef0f4d..00000000 --- a/examples/c-dataflow/operator.c +++ /dev/null @@ -1,70 +0,0 @@ -#include "../../apis/c/operator/operator_api.h" -#include -#include -#include -#include - -DoraInitResult_t dora_init_operator(void) -{ - void *context = malloc(1); - char *context_char = (char *)context; - *context_char = 0; - - DoraInitResult_t result = {.operator_context = context}; - return result; -} - -DoraResult_t dora_drop_operator(void *operator_context) -{ - free(operator_context); - - DoraResult_t result = {}; - return result; -} - 
-OnInputResult_t dora_on_input( - const Input_t *input, - const SendOutput_t *send_output, - void *operator_context) -{ - char *counter = (char *)operator_context; - - char id[input->id.len + 1]; - memcpy(id, input->id.ptr, input->id.len); - id[input->id.len] = 0; - - if (strcmp(id, "tick") == 0) - { - char data[input->data.len + 1]; - memcpy(data, input->data.ptr, input->data.len); - data[input->data.len] = 0; - - *counter += 1; - printf("C operator received tick input with data `%s`, counter: %i\n", data, *counter); - - char *out_id = "counter"; - char *out_id_heap = strdup(out_id); - - int data_alloc_size = 100; - char *out_data = (char *)malloc(data_alloc_size); - int count = snprintf(out_data, data_alloc_size, "The current counter value is %d", *counter); - assert(count >= 0 && count < 100); - - Output_t output = {.id = { - .ptr = (uint8_t *)out_id_heap, - .len = strlen(out_id_heap), - .cap = strlen(out_id_heap) + 1, - }, - .data = {.ptr = (uint8_t *)out_data, .len = strlen(out_data), .cap = data_alloc_size}}; - DoraResult_t res = (send_output->send_output.call)(send_output->send_output.env_ptr, output); - - OnInputResult_t result = {.result = res, .status = DORA_STATUS_CONTINUE}; - return result; - } - else - { - printf("C operator received unexpected input %s, context: %i\n", id, *counter); - OnInputResult_t result = {.status = DORA_STATUS_CONTINUE}; - return result; - } -} diff --git a/examples/c-dataflow/run.rs b/examples/c-dataflow/run.rs index 724729fc..b041b773 100644 --- a/examples/c-dataflow/run.rs +++ b/examples/c-dataflow/run.rs @@ -13,18 +13,12 @@ async fn main() -> eyre::Result<()> { tokio::fs::create_dir_all("build").await?; - build_package("dora-runtime").await?; build_package("dora-node-api-c").await?; - build_package("dora-operator-api-c").await?; build_c_node(root, "node.c", "c_node").await?; build_c_node(root, "sink.c", "c_sink").await?; - build_c_operator().await?; - dora_coordinator::run(dora_coordinator::Args { - run_dataflow: 
Path::new("dataflow.yml").to_owned().into(), - runtime: Some(root.join("target").join("debug").join("dora-runtime")), - }) - .await?; + let dataflow = Path::new("dataflow.yml").to_owned(); + dora_daemon::Daemon::run_dataflow(&dataflow).await?; Ok(()) } @@ -103,28 +97,6 @@ async fn build_c_node(root: &Path, name: &str, out_name: &str) -> eyre::Result<( Ok(()) } -async fn build_c_operator() -> eyre::Result<()> { - let mut compile = tokio::process::Command::new("clang"); - compile.arg("-c").arg("operator.c"); - compile.arg("-o").arg("build/operator.o"); - compile.arg("-fdeclspec"); - #[cfg(unix)] - compile.arg("-fPIC"); - if !compile.status().await?.success() { - bail!("failed to compile c operator"); - }; - - let mut link = tokio::process::Command::new("clang"); - link.arg("-shared").arg("build/operator.o"); - link.arg("-o") - .arg(Path::new("build").join(library_filename("operator"))); - if !link.status().await?.success() { - bail!("failed to link c operator"); - }; - - Ok(()) -} - // taken from `rust_libloading` crate by Simonas Kazlauskas, licensed under the ISC license ( // see https://github.com/nagisa/rust_libloading/blob/master/LICENSE) pub fn library_filename>(name: S) -> OsString { diff --git a/examples/c-dataflow/sink.c b/examples/c-dataflow/sink.c index ac15b987..2cd7857c 100644 --- a/examples/c-dataflow/sink.c +++ b/examples/c-dataflow/sink.c @@ -19,28 +19,47 @@ int main() while (1) { printf("[c sink] waiting for next input\n"); - void *input = dora_next_input(dora_context); - if (input == NULL) + void *event = dora_next_event(dora_context); + if (event == NULL) { - printf("[c sink] end of input\n"); + printf("[c sink] end of event\n"); break; } - char *id; - size_t id_len; - read_dora_input_id(input, &id, &id_len); + enum EventType ty = read_dora_event_type(event); - char *data; - size_t data_len; - read_dora_input_data(input, &data, &data_len); + if (ty == Input) + { + char *id; + size_t id_len; + read_dora_input_id(event, &id, &id_len); + + char *data; 
+ size_t data_len; + read_dora_input_data(event, &data, &data_len); - printf("sink received input `"); - fwrite(id, id_len, 1, stdout); - printf("` with data: '"); - fwrite(data, data_len, 1, stdout); - printf("'\n"); + printf("[c sink] received input `"); + fwrite(id, id_len, 1, stdout); + printf("` with data: %d\n", *data); + } + else if (ty == InputClosed) + { + printf("[c sink] received InputClosed event\n"); + free_dora_event(event); + break; + } + else if (ty == Stop) + { + printf("[c sink] received stop event\n"); + free_dora_event(event); + break; + } + else + { + printf("[c sink] received unexpected event: %d\n", ty); + } - free_dora_input(input); + free_dora_event(event); } free_dora_context(dora_context); From 2d1e317b954c43a977f3d292435903166c580977 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 28 Dec 2022 15:57:42 +0100 Subject: [PATCH 057/225] Ignore dora-runtime for now --- Cargo.lock | 51 --------------------------------------------------- Cargo.toml | 5 ++++- 2 files changed, 4 insertions(+), 52 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6462e214..53b9a779 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1146,37 +1146,6 @@ dependencies = [ "safer-ffi", ] -[[package]] -name = "dora-runtime" -version = "0.1.2" -dependencies = [ - "clap 3.2.20", - "dora-core", - "dora-download", - "dora-message", - "dora-metrics", - "dora-node-api", - "dora-operator-api-python", - "dora-operator-api-types", - "dora-tracing", - "eyre", - "fern", - "flume", - "futures", - "futures-concurrency 2.0.3", - "libloading", - "opentelemetry", - "opentelemetry-system-metrics", - "pyo3", - "serde_yaml 0.8.23", - "tokio", - "tokio-stream", - "tracing", - "tracing-subscriber", - "zenoh", - "zenoh-config", -] - [[package]] name = "dora-tracing" version = "0.1.2" @@ -1257,15 +1226,6 @@ dependencies = [ "instant", ] -[[package]] -name = "fern" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3bdd7b0849075e79ee9a1836df22c717d1eba30451796fdc631b04565dd11e2a" -dependencies = [ - "log", -] - [[package]] name = "fixedbitset" version = "0.4.1" @@ -1357,17 +1317,6 @@ dependencies = [ "futures-sink", ] -[[package]] -name = "futures-concurrency" -version = "2.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e98b7b5aedee7c34a5cfb1ee1681af8faf46e2f30c0b8af5ea08eba517d61c" -dependencies = [ - "async-trait", - "futures-core", - "pin-project", -] - [[package]] name = "futures-concurrency" version = "5.0.1" diff --git a/Cargo.toml b/Cargo.toml index 8c6c1eca..4e79b0d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,10 @@ members = [ "apis/rust/*", "apis/rust/operator/macros", "apis/rust/operator/types", - "binaries/*", + "binaries/cli", + "binaries/coordinator", + "binaries/daemon", + # "binaries/runtime", "examples/rust-dataflow/*", "examples/iceoryx/*", "libraries/communication-layer/*", From a406d2e3ccb38a62938a5f86fd0559effcd54ff3 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 28 Dec 2022 17:44:13 +0100 Subject: [PATCH 058/225] Update C++ API for new daemon design --- Cargo.lock | 12 +++--- apis/c++/node/src/lib.rs | 80 ++++++++++++++++++++++++++-------------- 2 files changed, 59 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 53b9a779..6ce13d84 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -765,9 +765,9 @@ checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" [[package]] name = "cxx" -version = "1.0.73" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "873c2e83af70859af2aaecd1f5d862f3790b747b1f4f50fb45a931d000ac0422" +checksum = "5add3fc1717409d029b20c5b6903fc0c0b02fa6741d820054f4a2efa5e5816fd" dependencies = [ "cc", "cxxbridge-flags", @@ -792,15 +792,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.73" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "f46b787c15af80277db5c88c6ac6c502ae545e622f010e06f95e540d34931acf" +checksum = "69a3e162fde4e594ed2b07d0f83c6c67b745e7f28ce58c6df5e6b6bef99dfb59" [[package]] name = "cxxbridge-macro" -version = "1.0.73" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ba3f3a7efa46626878fb5d324fabca4d19d2956b6ae97ce43044ef4515f5abc" +checksum = "3e7e2adeb6a0d4a282e581096b06e1791532b7d576dcde5ccd9382acf55db8e6" dependencies = [ "proc-macro2", "quote", diff --git a/apis/c++/node/src/lib.rs b/apis/c++/node/src/lib.rs index a963017e..b60f0fda 100644 --- a/apis/c++/node/src/lib.rs +++ b/apis/c++/node/src/lib.rs @@ -1,14 +1,27 @@ -use dora_node_api::{self, Input, Receiver}; +use dora_node_api::{ + self, + daemon::{Event, EventStream}, +}; +use eyre::bail; #[cxx::bridge] +#[allow(clippy::needless_lifetimes)] mod ffi { struct DoraNode { - inputs: Box, + events: Box, send_output: Box, } + pub enum DoraEventType { + Stop, + Input, + InputClosed, + Error, + Unknown, + AllInputsClosed, + } + struct DoraInput { - end_of_input: bool, id: String, data: Vec, } @@ -18,12 +31,16 @@ mod ffi { } extern "Rust" { - type Inputs; + type Events; type OutputSender; + type DoraEvent<'a>; fn init_dora_node() -> Result; fn free_dora_node(node: DoraNode); - fn next_input(inputs: &mut Box) -> DoraInput; + + fn next_event(inputs: &mut Box) -> Box>; + fn event_type(event: &Box) -> DoraEventType; + fn event_as_input(event: Box) -> Result; fn send_output( output_sender: &mut Box, id: String, @@ -33,13 +50,12 @@ mod ffi { } fn init_dora_node() -> eyre::Result { - let mut node = dora_node_api::DoraNode::init_from_env()?; - let input_stream = node.inputs()?; - let inputs = Inputs(input_stream); + let (node, events) = dora_node_api::DoraNode::init_from_env()?; + let inputs = Events(events); let send_output = OutputSender(node); Ok(ffi::DoraNode { - inputs: Box::new(inputs), + events: Box::new(inputs), send_output: Box::new(send_output), }) } @@ -48,33 +64,43 
@@ fn free_dora_node(node: ffi::DoraNode) { let _ = node; } -pub struct Inputs(Receiver); - -fn next_input(inputs: &mut Box) -> ffi::DoraInput { - match inputs.0.recv() { - Ok(input) => { - let id = input.id.clone().into(); - let data = input.data(); - ffi::DoraInput { - end_of_input: false, - id, - data: data.into_owned(), - } - } - Err(_) => ffi::DoraInput { - end_of_input: true, - id: String::new(), - data: Vec::new(), +pub struct Events(EventStream); + +fn next_event(events: &mut Box) -> Box { + Box::new(DoraEvent(events.0.recv())) +} + +pub struct DoraEvent<'a>(Option>); + +fn event_type(event: &Box) -> ffi::DoraEventType { + match &event.0 { + Some(event) => match event { + Event::Stop => ffi::DoraEventType::Stop, + Event::Input { .. } => ffi::DoraEventType::Input, + Event::InputClosed { .. } => ffi::DoraEventType::InputClosed, + Event::Error(_) => ffi::DoraEventType::Error, + _ => ffi::DoraEventType::Unknown, }, + None => ffi::DoraEventType::AllInputsClosed, } } +fn event_as_input(event: Box) -> eyre::Result { + let Some(Event::Input { id, metadata: _, data }) = event.0 else { + bail!("not an input event"); + }; + Ok(ffi::DoraInput { + id: id.into(), + data: data.map(|d| d.to_owned()).unwrap_or_default(), + }) +} + pub struct OutputSender(dora_node_api::DoraNode); fn send_output(sender: &mut Box, id: String, data: &[u8]) -> ffi::DoraResult { let result = sender .0 - .send_output(&id.into(), Default::default(), data.len(), |out| { + .send_output(id.into(), Default::default(), data.len(), |out| { out.copy_from_slice(data) }); let error = match result { From c2fe771cbaf53c1eb033c8609a07c4f5f056cdf9 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 28 Dec 2022 17:44:40 +0100 Subject: [PATCH 059/225] Start updating C++ example for new design --- examples/c++-dataflow/dataflow.yml | 44 +++++----- examples/c++-dataflow/node-c-api/main.cc | 71 ---------------- examples/c++-dataflow/node-rust-api/main.cc | 32 +++++--- examples/c++-dataflow/run.rs | 89 
++++++++------------- 4 files changed, 79 insertions(+), 157 deletions(-) delete mode 100644 examples/c++-dataflow/node-c-api/main.cc diff --git a/examples/c++-dataflow/dataflow.yml b/examples/c++-dataflow/dataflow.yml index 960866e6..02f1d4c3 100644 --- a/examples/c++-dataflow/dataflow.yml +++ b/examples/c++-dataflow/dataflow.yml @@ -10,26 +10,26 @@ nodes: tick: dora/timer/millis/300 outputs: - counter - - id: cxx-node-c-api - custom: - source: build/node_c_api - inputs: - tick: dora/timer/millis/300 - outputs: - - counter + # - id: cxx-node-c-api + # custom: + # source: build/node_c_api + # inputs: + # tick: dora/timer/millis/300 + # outputs: + # - counter - - id: runtime-node - operators: - - id: operator-rust-api - shared-library: build/operator_rust_api - inputs: - counter_1: cxx-node-c-api/counter - counter_2: cxx-node-rust-api/counter - outputs: - - status - - id: operator-c-api - shared-library: build/operator_c_api - inputs: - op_status: runtime-node/operator-rust-api/status - outputs: - - half-status + # - id: runtime-node + # operators: + # - id: operator-rust-api + # shared-library: build/operator_rust_api + # inputs: + # counter_1: cxx-node-c-api/counter + # counter_2: cxx-node-rust-api/counter + # outputs: + # - status + # - id: operator-c-api + # shared-library: build/operator_c_api + # inputs: + # op_status: runtime-node/operator-rust-api/status + # outputs: + # - half-status diff --git a/examples/c++-dataflow/node-c-api/main.cc b/examples/c++-dataflow/node-c-api/main.cc deleted file mode 100644 index d4fb7cad..00000000 --- a/examples/c++-dataflow/node-c-api/main.cc +++ /dev/null @@ -1,71 +0,0 @@ -extern "C" -{ -#include "../../../apis/c/node/node_api.h" -} - -#include -#include - -int run(void *dora_context) -{ - unsigned char counter = 0; - - for (int i = 0; i < 20; i++) - { - - auto input = dora_next_input(dora_context); - if (input == NULL) - { - return 0; // end of input - } - counter += 1; - - char *id_ptr; - size_t id_len; - 
read_dora_input_id(input, &id_ptr, &id_len); - std::string id(id_ptr, id_len); - - char *data_ptr; - size_t data_len; - read_dora_input_data(input, &data_ptr, &data_len); - std::vector data; - for (size_t i = 0; i < data_len; i++) - { - data.push_back(*(data_ptr + i)); - } - - std::cout - << "Received input " - << " (counter: " << (unsigned int)counter << ") data: ["; - for (unsigned char &v : data) - { - std::cout << (unsigned int)v << ", "; - } - std::cout << "]" << std::endl; - - free_dora_input(input); - - std::vector out_vec{counter}; - - std::string out_id = "counter"; - - int result = dora_send_output(dora_context, &out_id[0], out_id.length(), (char *)&counter, 1); - if (result != 0) - { - std::cerr << "failed to send output" << std::endl; - return 1; - } - } - return 0; -} - -int main() -{ - std::cout << "HELLO FROM C++ (using C API)" << std::endl; - - auto dora_context = init_dora_context_from_env(); - auto ret = run(dora_context); - free_dora_context(dora_context); - - return ret; -} diff --git a/examples/c++-dataflow/node-rust-api/main.cc b/examples/c++-dataflow/node-rust-api/main.cc index 3490d5a3..df8542ca 100644 --- a/examples/c++-dataflow/node-rust-api/main.cc +++ b/examples/c++-dataflow/node-rust-api/main.cc @@ -13,22 +13,34 @@ int main() for (int i = 0; i < 20; i++) { - auto input = next_input(dora_node.inputs); - if (input.end_of_input) + auto event = next_event(dora_node.events); + auto ty = event_type(event); + + if (ty == DoraEventType::AllInputsClosed) { break; } - counter += 1; + else if (ty == DoraEventType::Input) + { + auto input = event_as_input(std::move(event)); - std::cout << "Received input " << std::string(input.id) << " (counter: " << (unsigned int)counter << ")" << std::endl; + counter += 1; - std::vector out_vec{counter}; - rust::Slice out_slice{out_vec.data(), out_vec.size()}; - auto result = send_output(dora_node.send_output, "counter", out_slice); - auto error = std::string(result.error); - if (!error.empty()) + std::cout << 
"Received input " << std::string(input.id) << " (counter: " << (unsigned int)counter << ")" << std::endl; + + std::vector out_vec{counter}; + rust::Slice out_slice{out_vec.data(), out_vec.size()}; + auto result = send_output(dora_node.send_output, "counter", out_slice); + auto error = std::string(result.error); + if (!error.empty()) + { + std::cerr << "Error: " << error << std::endl; + return -1; + } + } + else { - std::cerr << "Error: " << error << std::endl; + std::cerr << "Unknown event type " << static_cast(ty) << std::endl; return -1; } } diff --git a/examples/c++-dataflow/run.rs b/examples/c++-dataflow/run.rs index b76c52d3..a6c3c285 100644 --- a/examples/c++-dataflow/run.rs +++ b/examples/c++-dataflow/run.rs @@ -20,22 +20,6 @@ async fn main() -> eyre::Result<()> { tokio::fs::create_dir_all("build").await?; let build_dir = Path::new("build"); - build_package("dora-operator-api-cxx").await?; - let operator_cxxbridge = target - .join("cxxbridge") - .join("dora-operator-api-cxx") - .join("src"); - tokio::fs::copy( - operator_cxxbridge.join("lib.rs.cc"), - build_dir.join("operator-bridge.cc"), - ) - .await?; - tokio::fs::copy( - operator_cxxbridge.join("lib.rs.h"), - build_dir.join("dora-operator-api.h"), - ) - .await?; - build_package("dora-node-api-cxx").await?; let node_cxxbridge = target .join("cxxbridge") @@ -58,7 +42,7 @@ async fn main() -> eyre::Result<()> { .await?; build_package("dora-node-api-c").await?; - build_package("dora-operator-api-c").await?; + // build_package("dora-operator-api-c").await?; build_cxx_node( root, &[ @@ -69,45 +53,42 @@ async fn main() -> eyre::Result<()> { &["-l", "dora_node_api_cxx"], ) .await?; - build_cxx_node( - root, - &[&dunce::canonicalize( - Path::new("node-c-api").join("main.cc"), - )?], - "node_c_api", - &["-l", "dora_node_api_c"], - ) - .await?; - build_cxx_operator( - &[ - &dunce::canonicalize(Path::new("operator-rust-api").join("operator.cc"))?, - &dunce::canonicalize(build_dir.join("operator-bridge.cc"))?, - ], - 
"operator_rust_api", - &[ - "-l", - "dora_operator_api_cxx", - "-L", - &root.join("target").join("debug").to_str().unwrap(), - ], - ) - .await?; - build_cxx_operator( - &[&dunce::canonicalize( - Path::new("operator-c-api").join("operator.cc"), - )?], - "operator_c_api", - &[], - ) - .await?; + // build_cxx_node( + // root, + // &[&dunce::canonicalize( + // Path::new("node-c-api").join("main.cc"), + // )?], + // "node_c_api", + // &["-l", "dora_node_api_c"], + // ) + // .await?; + // build_cxx_operator( + // &[ + // &dunce::canonicalize(Path::new("operator-rust-api").join("operator.cc"))?, + // &dunce::canonicalize(build_dir.join("operator-bridge.cc"))?, + // ], + // "operator_rust_api", + // &[ + // "-l", + // "dora_operator_api_cxx", + // "-L", + // &root.join("target").join("debug").to_str().unwrap(), + // ], + // ) + // .await?; + // build_cxx_operator( + // &[&dunce::canonicalize( + // Path::new("operator-c-api").join("operator.cc"), + // )?], + // "operator_c_api", + // &[], + // ) + // .await?; - build_package("dora-runtime").await?; + // build_package("dora-runtime").await?; - dora_coordinator::run(dora_coordinator::Args { - run_dataflow: Path::new("dataflow.yml").to_owned().into(), - runtime: Some(root.join("target").join("debug").join("dora-runtime")), - }) - .await?; + let dataflow = Path::new("dataflow.yml").to_owned(); + dora_daemon::Daemon::run_dataflow(&dataflow).await?; Ok(()) } From 0405f22ca2b5f9ffa3ad9628e7aa00e9e281b3d2 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 28 Dec 2022 17:45:47 +0100 Subject: [PATCH 060/225] Remove iceoryx example (we no longer need iceoryx) --- Cargo.lock | 26 --------------- Cargo.toml | 5 --- examples/iceoryx/dataflow.yml | 26 --------------- examples/iceoryx/node/Cargo.toml | 11 ------- examples/iceoryx/node/src/main.rs | 34 -------------------- examples/iceoryx/operator/Cargo.toml | 13 -------- examples/iceoryx/operator/src/lib.rs | 47 ---------------------------- examples/iceoryx/run.rs | 33 
------------------- examples/iceoryx/sink/Cargo.toml | 12 ------- examples/iceoryx/sink/src/main.rs | 28 ----------------- 10 files changed, 235 deletions(-) delete mode 100644 examples/iceoryx/dataflow.yml delete mode 100644 examples/iceoryx/node/Cargo.toml delete mode 100644 examples/iceoryx/node/src/main.rs delete mode 100644 examples/iceoryx/operator/Cargo.toml delete mode 100644 examples/iceoryx/operator/src/lib.rs delete mode 100644 examples/iceoryx/run.rs delete mode 100644 examples/iceoryx/sink/Cargo.toml delete mode 100644 examples/iceoryx/sink/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 6ce13d84..ad627541 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1672,32 +1672,6 @@ dependencies = [ "tokio-native-tls", ] -[[package]] -name = "iceoryx-example-node" -version = "0.1.2" -dependencies = [ - "dora-node-api", - "eyre", - "rand", -] - -[[package]] -name = "iceoryx-example-operator" -version = "0.1.2" -dependencies = [ - "dora-operator-api", -] - -[[package]] -name = "iceoryx-example-sink" -version = "0.1.2" -dependencies = [ - "dora-node-api", - "eyre", - "futures", - "tokio", -] - [[package]] name = "iceoryx-rs" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 4e79b0d9..2de4e586 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,6 @@ members = [ "binaries/daemon", # "binaries/runtime", "examples/rust-dataflow/*", - "examples/iceoryx/*", "libraries/communication-layer/*", "libraries/core", "libraries/message", @@ -66,7 +65,3 @@ path = "examples/c++-dataflow/run.rs" [[example]] name = "python-dataflow" path = "examples/python-dataflow/run.rs" - -[[example]] -name = "iceoryx" -path = "examples/iceoryx/run.rs" diff --git a/examples/iceoryx/dataflow.yml b/examples/iceoryx/dataflow.yml deleted file mode 100644 index 56b0face..00000000 --- a/examples/iceoryx/dataflow.yml +++ /dev/null @@ -1,26 +0,0 @@ -communication: - iceoryx: - app_name_prefix: dora-iceoryx-example - -nodes: - - id: rust-node - custom: - source: 
../../target/debug/iceoryx-example-node - inputs: - tick: dora/timer/millis/300 - outputs: - - random - - id: runtime-node - operators: - - id: rust-operator - shared-library: ../../target/debug/iceoryx_example_operator - inputs: - tick: dora/timer/millis/100 - random: rust-node/random - outputs: - - status - - id: rust-sink - custom: - source: ../../target/debug/iceoryx-example-sink - inputs: - message: runtime-node/rust-operator/status diff --git a/examples/iceoryx/node/Cargo.toml b/examples/iceoryx/node/Cargo.toml deleted file mode 100644 index 7ecdc899..00000000 --- a/examples/iceoryx/node/Cargo.toml +++ /dev/null @@ -1,11 +0,0 @@ -[package] -name = "iceoryx-example-node" -version.workspace = true -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -dora-node-api = { workspace = true } -eyre = "0.6.8" -rand = "0.8.5" diff --git a/examples/iceoryx/node/src/main.rs b/examples/iceoryx/node/src/main.rs deleted file mode 100644 index 7e8f92fb..00000000 --- a/examples/iceoryx/node/src/main.rs +++ /dev/null @@ -1,34 +0,0 @@ -use dora_node_api::{self, dora_core::config::DataId, DoraNode}; - -fn main() -> eyre::Result<()> { - let output = DataId::from("random".to_owned()); - - let mut operator = DoraNode::init_from_env()?; - - let inputs = operator.inputs()?; - - for _ in 0..20 { - let input = match inputs.recv() { - Ok(input) => input, - Err(_) => break, - }; - - match input.id.as_str() { - "tick" => { - let random: u64 = rand::random(); - let data: &[u8] = &random.to_le_bytes(); - operator.send_output( - &output, - input.metadata().parameters.clone(), - data.len(), - |out| { - out.copy_from_slice(data); - }, - )?; - } - other => eprintln!("Ignoring unexpected input `{other}`"), - } - } - - Ok(()) -} diff --git a/examples/iceoryx/operator/Cargo.toml b/examples/iceoryx/operator/Cargo.toml deleted file mode 100644 index 6b1368e6..00000000 --- a/examples/iceoryx/operator/Cargo.toml +++ 
/dev/null @@ -1,13 +0,0 @@ -[package] -name = "iceoryx-example-operator" -version.workspace = true -edition = "2021" -license = "Apache-2.0" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[lib] -crate-type = ["cdylib"] - -[dependencies] -dora-operator-api = { path = "../../../apis/rust/operator" } diff --git a/examples/iceoryx/operator/src/lib.rs b/examples/iceoryx/operator/src/lib.rs deleted file mode 100644 index 3c0713a1..00000000 --- a/examples/iceoryx/operator/src/lib.rs +++ /dev/null @@ -1,47 +0,0 @@ -#![warn(unsafe_op_in_unsafe_fn)] - -use dora_operator_api::{register_operator, DoraOperator, DoraOutputSender, DoraStatus}; -use std::time::{Duration, Instant}; - -register_operator!(ExampleOperator); - -#[derive(Debug, Default)] -struct ExampleOperator { - ticks: usize, - last_random_at: Option, -} - -impl DoraOperator for ExampleOperator { - fn on_input( - &mut self, - id: &str, - data: &[u8], - output_sender: &mut DoraOutputSender, - ) -> Result { - match id { - "tick" => { - self.ticks += 1; - } - "random" => { - let parsed = { - let data: [u8; 8] = data.try_into().map_err(|_| "unexpected random data")?; - u64::from_le_bytes(data) - }; - let output = format!( - "operator received random value {parsed} after {} ticks", - self.ticks - ); - output_sender.send("status".into(), output.into_bytes())?; - self.last_random_at = Some(Instant::now()); - } - other => eprintln!("ignoring unexpected input {other}"), - } - if let Some(last_random_at) = self.last_random_at { - if last_random_at.elapsed() > Duration::from_secs(1) { - // looks like the node sending the random values finished -> exit too - return Ok(DoraStatus::Stop); - } - } - Ok(DoraStatus::Continue) - } -} diff --git a/examples/iceoryx/run.rs b/examples/iceoryx/run.rs deleted file mode 100644 index 87a7d6fb..00000000 --- a/examples/iceoryx/run.rs +++ /dev/null @@ -1,33 +0,0 @@ -use eyre::{bail, Context}; -use std::path::Path; - -#[tokio::main] -async 
fn main() -> eyre::Result<()> { - let root = Path::new(env!("CARGO_MANIFEST_DIR")); - std::env::set_current_dir(root.join(file!()).parent().unwrap()) - .wrap_err("failed to set working dir")?; - - build_package("iceoryx-example-node").await?; - build_package("iceoryx-example-operator").await?; - build_package("iceoryx-example-sink").await?; - build_package("dora-runtime").await?; - - dora_coordinator::run(dora_coordinator::Args { - run_dataflow: Path::new("dataflow.yml").to_owned().into(), - runtime: Some(root.join("target").join("debug").join("dora-runtime")), - }) - .await?; - - Ok(()) -} - -async fn build_package(package: &str) -> eyre::Result<()> { - let cargo = std::env::var("CARGO").unwrap(); - let mut cmd = tokio::process::Command::new(&cargo); - cmd.arg("build"); - cmd.arg("--package").arg(package); - if !cmd.status().await?.success() { - bail!("failed to build {package}"); - }; - Ok(()) -} diff --git a/examples/iceoryx/sink/Cargo.toml b/examples/iceoryx/sink/Cargo.toml deleted file mode 100644 index ae81e625..00000000 --- a/examples/iceoryx/sink/Cargo.toml +++ /dev/null @@ -1,12 +0,0 @@ -[package] -name = "iceoryx-example-sink" -version.workspace = true -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -dora-node-api = { workspace = true } -eyre = "0.6.8" -futures = "0.3.21" -tokio = { version = "1.20.1", features = ["macros"] } diff --git a/examples/iceoryx/sink/src/main.rs b/examples/iceoryx/sink/src/main.rs deleted file mode 100644 index f9c932a5..00000000 --- a/examples/iceoryx/sink/src/main.rs +++ /dev/null @@ -1,28 +0,0 @@ -use dora_node_api::{self, DoraNode}; -use eyre::{bail, Context}; - -fn main() -> eyre::Result<()> { - let mut operator = DoraNode::init_from_env()?; - - let inputs = operator.inputs()?; - - while let Ok(input) = inputs.recv() { - match input.id.as_str() { - "message" => { - let data = input.data(); - let received_string = - 
std::str::from_utf8(&data).wrap_err("received message was not utf8-encoded")?; - println!("received message: {}", received_string); - if !received_string.starts_with("operator received random value ") { - bail!("unexpected message format (should start with 'operator received random value')") - } - if !received_string.ends_with(" ticks") { - bail!("unexpected message format (should end with 'ticks')") - } - } - other => eprintln!("Ignoring unexpected input `{other}`"), - } - } - - Ok(()) -} From bb2e8c2920ecce85691c7ced6f4b0d45361bae3f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 28 Dec 2022 17:47:03 +0100 Subject: [PATCH 061/225] Remove iceoryx from CI jobs --- .github/workflows/ci.yml | 9 --------- .github/workflows/release.yml | 3 --- 2 files changed, 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b9e5624a..056953ee 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -67,15 +67,6 @@ jobs: timeout-minutes: 15 run: cargo run --example cxx-dataflow - - name: "Start RouDi (iceoryx)" - if: runner.os != 'Windows' - run: find target -type f -wholename "*/iceoryx-install/bin/iox-roudi" -exec {} \; & - - - name: "Rust iceoryx example" - if: runner.os != 'Windows' - timeout-minutes: 30 - run: cargo run --example iceoryx - examples-remote: name: "Examples (Remote)" runs-on: ubuntu-latest diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0a882afe..41c2d6fd 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -41,9 +41,6 @@ jobs: cp target/release/dora-runtime archive cp target/release/dora-coordinator archive cp target/release/dora-cli archive/dora - mkdir archive/iceoryx - find target -type f -wholename "*/iceoryx-install/bin/iox-roudi" -exec cp {} archive/iceoryx \; - find target -type f -wholename "*/iceoryx-install/share/doc/iceoryx_posh/LICENSE" -exec cp {} archive/iceoryx \; cd archive zip -r ../archive.zip . cd .. 
From 63444b9760d65f1a4c38badd9753ed3efb88731e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 28 Dec 2022 17:57:47 +0100 Subject: [PATCH 062/225] Fix CI failures caused by `rust-dataflow-url` example --- .github/workflows/ci.yml | 1 + examples/rust-dataflow-url/run.rs | 7 +------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 056953ee..da37b506 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -84,6 +84,7 @@ jobs: - uses: Swatinem/rust-cache@v2 - name: "Remote Rust Dataflow example" + if: false # skip this example for now until we uploaded new test nodes timeout-minutes: 30 run: cargo run --example rust-dataflow-url diff --git a/examples/rust-dataflow-url/run.rs b/examples/rust-dataflow-url/run.rs index 9378905c..7d0698a3 100644 --- a/examples/rust-dataflow-url/run.rs +++ b/examples/rust-dataflow-url/run.rs @@ -9,13 +9,8 @@ async fn main() -> eyre::Result<()> { let dataflow = Path::new("dataflow.yml"); build_dataflow(dataflow).await?; - build_package("dora-runtime").await?; - dora_coordinator::run(dora_coordinator::Args { - run_dataflow: dataflow.to_owned().into(), - runtime: Some(root.join("target").join("debug").join("dora-runtime")), - }) - .await?; + dora_daemon::Daemon::run_dataflow(dataflow).await?; Ok(()) } From 07311fd6110e43f762ad0891b39cdd2ab336fa7c Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 09:45:08 +0100 Subject: [PATCH 063/225] Check exit status of all nodes when using `exit_when_done` Instead of finishing already when all nodes sent their `Stopped` message, we now check the exit code of the node executables too. 
--- binaries/daemon/src/lib.rs | 64 ++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 6619060b..1dee4a1e 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -50,7 +50,7 @@ pub struct Daemon { machine_id: String, /// used for testing and examples - exit_when_done: Option>, + exit_when_done: Option>, } impl Daemon { @@ -94,7 +94,11 @@ impl Daemon { nodes: custom_nodes, }; - let exit_when_done = [spawn_command.dataflow_id].into(); + let exit_when_done = spawn_command + .nodes + .iter() + .map(|(id, _)| (spawn_command.dataflow_id, id.clone())) + .collect(); let (reply_tx, reply_rx) = oneshot::channel(); let coordinator_events = stream::once(async move { Event::Coordinator(CoordinatorEvent { @@ -126,7 +130,7 @@ impl Daemon { external_events: impl Stream + Unpin, coordinator_addr: Option, machine_id: String, - exit_when_done: Option>, + exit_when_done: Option>, ) -> eyre::Result<()> { // create listener for node connection let listener = listener::create_listener().await?; @@ -187,15 +191,13 @@ impl Daemon { event, reply_sender, } => { - match self - .handle_node_event(event, dataflow, node_id, reply_sender) + self.handle_node_event(event, dataflow, node_id, reply_sender) .await? - { - RunStatus::Continue => {} - RunStatus::Exit => break, - } } - Event::Dora(event) => self.handle_dora_event(event).await?, + Event::Dora(event) => match self.handle_dora_event(event).await? 
{ + RunStatus::Continue => {} + RunStatus::Exit => break, + }, Event::Drop(DropEvent { token }) => { match self.sent_out_shared_memory.remove(&token) { Some(rc) => { @@ -309,7 +311,7 @@ impl Daemon { dataflow_id: DataflowId, node_id: NodeId, reply_sender: oneshot::Sender, - ) -> eyre::Result { + ) -> eyre::Result<()> { match event { DaemonNodeEvent::Subscribe { event_sender } => { let result = match self.running.get_mut(&dataflow_id) { @@ -467,23 +469,13 @@ impl Daemon { } } self.running.remove(&dataflow_id); - - if let Some(exit_when_done) = &mut self.exit_when_done { - exit_when_done.remove(&dataflow_id); - if exit_when_done.is_empty() { - tracing::info!( - "exiting daemon because all required dataflows are finished" - ); - return Ok(RunStatus::Exit); - } - } } } } - Ok(RunStatus::Continue) + Ok(()) } - async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result<()> { + async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result { match event { DoraEvent::Timer { dataflow_id, @@ -492,11 +484,11 @@ impl Daemon { } => { let Some(dataflow) = self.running.get_mut(&dataflow_id) else { tracing::warn!("Timer event for unknown dataflow `{dataflow_id}`"); - return Ok(()) + return Ok(RunStatus::Continue); }; let Some(subscribers) = dataflow.timers.get(&interval) else { - return Ok(()); + return Ok(RunStatus::Continue); }; let mut closed = Vec::new(); @@ -543,18 +535,30 @@ impl Daemon { } match result { Ok(()) => { - tracing::info!("node {dataflow_id}/{node_id} finished"); + tracing::info!("node {dataflow_id}/{node_id} finished successfully"); } Err(err) => { - tracing::error!( - "{:?}", - err.wrap_err(format!("error in node `{dataflow_id}/{node_id}`")) + let err = err.wrap_err(format!("error in node `{dataflow_id}/{node_id}`")); + if self.exit_when_done.is_some() { + bail!(err); + } else { + tracing::error!("{err:?}",); + } + } + } + + if let Some(exit_when_done) = &mut self.exit_when_done { + exit_when_done.remove(&(dataflow_id, node_id)); + if 
exit_when_done.is_empty() { + tracing::info!( + "exiting daemon because all required dataflows are finished" ); + return Ok(RunStatus::Exit); } } } } - Ok(()) + Ok(RunStatus::Continue) } } From 619d0bb34b629c431ffcb596677a816e5a4ed21e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 09:57:54 +0100 Subject: [PATCH 064/225] Fix: limit slice length to requested length On Windows, the size of the shared memory region is rounded up to the next multiple of the page size. By slicing the region to the expected size we allow nodes to assume that they get exactly the requested size. --- apis/rust/node/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 3919fcda..51b97aa4 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -82,7 +82,7 @@ impl DoraNode { .wrap_err("failed to open shared memory sample")?; let raw = unsafe { shared_memory.as_slice_mut() }; - data(raw); + data(&mut raw[..data_len]); } else { data(&mut []); } From 778d9d7eb7a3bf7a65a923d4dd369aa2bffc4b7a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 10:25:39 +0100 Subject: [PATCH 065/225] Fix: Use correct length for received data The shared memory region size might be larger because of padding.
--- apis/rust/node/src/daemon.rs | 8 +++--- binaries/daemon/src/lib.rs | 35 +++++++++++++++++++-------- libraries/core/src/daemon_messages.rs | 1 + 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 43eb709c..03a272f3 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -216,7 +216,7 @@ impl EventStream { NodeEvent::InputClosed { id } => Event::InputClosed { id }, NodeEvent::Input { id, metadata, data } => { let mapped = data - .map(|d| unsafe { MappedInputData::map(&d.shared_memory_id) }) + .map(|d| unsafe { MappedInputData::map(&d.shared_memory_id, d.len) }) .transpose(); match mapped { Ok(mapped) => Event::Input { @@ -276,17 +276,19 @@ impl std::fmt::Debug for Data<'_> { pub struct MappedInputData<'a> { memory: Shmem, + len: usize, _data: PhantomData<&'a [u8]>, } impl MappedInputData<'_> { - unsafe fn map(shared_memory_id: &str) -> eyre::Result { + unsafe fn map(shared_memory_id: &str, len: usize) -> eyre::Result { let memory = ShmemConf::new() .os_id(shared_memory_id) .open() .wrap_err("failed to map shared memory input")?; Ok(MappedInputData { memory, + len, _data: PhantomData, }) } @@ -296,7 +298,7 @@ impl std::ops::Deref for MappedInputData<'_> { type Target = [u8]; fn deref(&self) -> &Self::Target { - unsafe { self.memory.as_slice() } + unsafe { &self.memory.as_slice()[..self.len] } } } diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 1dee4a1e..39582fac 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -39,7 +39,7 @@ mod tcp_utils; pub struct Daemon { port: u16, - prepared_messages: HashMap, Option)>, + prepared_messages: HashMap, sent_out_shared_memory: HashMap>, running: HashMap, @@ -342,8 +342,12 @@ impl Daemon { .as_ref() .map(|m| m.get_os_id().to_owned()) .unwrap_or_else(|| Uuid::new_v4().to_string()); - self.prepared_messages - .insert(id.clone(), (output_id, metadata, memory)); + let message = 
PreparedMessage { + output_id, + metadata, + data: memory.map(|m| (m, data_len)), + }; + self.prepared_messages.insert(id.clone(), message); let reply = ControlReply::PreparedMessage { shared_memory_id: id.clone(), @@ -354,12 +358,16 @@ impl Daemon { } } DaemonNodeEvent::SendOutMessage { id } => { - let (output_id, metadata, memory) = self + let message = self .prepared_messages .remove(&id) .ok_or_else(|| eyre!("invalid shared memory id"))?; - - let memory = memory.map(Rc::new); + let PreparedMessage { + output_id, + metadata, + data, + } = message; + let data = data.map(|(m, len)| (Rc::new(m), len)); let dataflow = self .running @@ -381,8 +389,9 @@ impl Daemon { let send_result = channel.send_async(daemon_messages::NodeEvent::Input { id: input_id.clone(), metadata: metadata.clone(), - data: memory.as_ref().map(|m| daemon_messages::InputData { + data: data.as_ref().map(|(m, len)| daemon_messages::InputData { shared_memory_id: m.get_os_id().to_owned(), + len: *len, drop_token: drop_token.clone(), }), }); @@ -390,7 +399,7 @@ impl Daemon { match timeout(Duration::from_millis(10), send_result).await { Ok(Ok(())) => { // keep shared memory ptr in order to free it once all subscribers are done - if let Some(memory) = &memory { + if let Some((memory, _)) = &data { self.sent_out_shared_memory .insert(drop_token, memory.clone()); } @@ -411,8 +420,8 @@ impl Daemon { } // TODO send `data` via network to all remove receivers - if let Some(memory) = &memory { - let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), memory.len()); + if let Some((memory, len)) = &data { + let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), *len); } let _ = reply_sender.send(ControlReply::Result(Ok(()))); @@ -562,6 +571,12 @@ impl Daemon { } } +struct PreparedMessage { + output_id: DataId, + metadata: dora_message::Metadata<'static>, + data: Option<(Shmem, usize)>, +} + #[derive(Default)] pub struct RunningDataflow { subscribe_channels: HashMap>, diff --git 
a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 68a81327..2d00e5bc 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -76,6 +76,7 @@ impl DropToken { #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct InputData { pub shared_memory_id: SharedMemoryId, + pub len: usize, pub drop_token: DropToken, } From 20ffe50389c680db473abcc60e4a077b2cd904ee Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 10:30:36 +0100 Subject: [PATCH 066/225] Use prefixed names for dora event enum variants in C API to avoid name conflicts --- apis/c/node/node_api.h | 14 +++++++------- examples/c-dataflow/node.c | 6 +++--- examples/c-dataflow/sink.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/apis/c/node/node_api.h b/apis/c/node/node_api.h index 4ad43888..50890876 100644 --- a/apis/c/node/node_api.h +++ b/apis/c/node/node_api.h @@ -6,15 +6,15 @@ void free_dora_context(void *dora_context); void *dora_next_event(void *dora_context); void free_dora_event(void *dora_event); -enum EventType +enum DoraEventType { - Stop, - Input, - InputClosed, - Error, - Unknown, + DoraEventType_Stop, + DoraEventType_Input, + DoraEventType_InputClosed, + DoraEventType_Error, + DoraEventType_Unknown, }; -enum EventType read_dora_event_type(void *dora_event); +enum DoraEventType read_dora_event_type(void *dora_event); void read_dora_input_id(void *dora_event, char **out_ptr, size_t *out_len); void read_dora_input_data(void *dora_event, char **out_ptr, size_t *out_len); diff --git a/examples/c-dataflow/node.c b/examples/c-dataflow/node.c index c1f2e307..ff8949ae 100644 --- a/examples/c-dataflow/node.c +++ b/examples/c-dataflow/node.c @@ -33,9 +33,9 @@ int main() return -1; } - enum EventType ty = read_dora_event_type(event); + enum DoraEventType ty = read_dora_event_type(event); - if (ty == Input) + if (ty == DoraEventType_Input) { char *data; size_t data_len; @@ 
-46,7 +46,7 @@ int main() char out_id[] = "counter"; dora_send_output(dora_context, out_id, strlen(out_id), &i, 1); } - else if (ty == Stop) + else if (ty == DoraEventType_Stop) { printf("[c node] received stop event\n"); free_dora_event(event); diff --git a/examples/c-dataflow/sink.c b/examples/c-dataflow/sink.c index 2cd7857c..486bab01 100644 --- a/examples/c-dataflow/sink.c +++ b/examples/c-dataflow/sink.c @@ -26,9 +26,9 @@ int main() break; } - enum EventType ty = read_dora_event_type(event); + enum DoraEventType ty = read_dora_event_type(event); - if (ty == Input) + if (ty == DoraEventType_Input) { char *id; size_t id_len; @@ -42,13 +42,13 @@ int main() fwrite(id, id_len, 1, stdout); printf("` with data: %d\n", *data); } - else if (ty == InputClosed) + else if (ty == DoraEventType_InputClosed) { printf("[c sink] received InputClosed event\n"); free_dora_event(event); break; } - else if (ty == Stop) + else if (ty == DoraEventType_Stop) { printf("[c sink] received stop event\n"); free_dora_event(event); From 0ace8f052bee69149ed073b0fa680dd84af98e45 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 11:22:39 +0100 Subject: [PATCH 067/225] Skip clippy runs for runtime feature The runtime is currently commented out. 
--- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da37b506..3ae03a65 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -106,8 +106,10 @@ jobs: run: cargo clippy --all - name: "Clippy (tracing feature)" run: cargo clippy --all --features tracing + if: false # only the dora-runtime has this feature, but it is currently commented out - name: "Clippy (metrics feature)" run: cargo clippy --all --features metrics + if: false # only the dora-runtime has this feature, but it is currently commented out rustfmt: name: "Formatting" From 8c1c29f7f8658c4df5a78a636ad5ae0e9dbdc262 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 12:14:20 +0100 Subject: [PATCH 068/225] Remove iceoryx roudi handling from dora-cli --- Cargo.lock | 29 ++------------------- binaries/cli/Cargo.toml | 1 - binaries/cli/src/check.rs | 15 ----------- binaries/cli/src/main.rs | 4 --- binaries/cli/src/up.rs | 54 +++++---------------------------------- 5 files changed, 8 insertions(+), 95 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ad627541..b08f01c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -914,7 +914,6 @@ dependencies = [ "serde", "serde_json", "serde_yaml 0.9.11", - "sysinfo 0.26.6", "tempfile", "termcolor", "uuid 1.2.1", @@ -2253,15 +2252,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "ntapi" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc51db7b362b205941f71232e56c625156eb9a929f8cf74a428fd5bc094a4afc" -dependencies = [ - "winapi", -] - [[package]] name = "num-bigint-dig" version = "0.7.0" @@ -2496,7 +2486,7 @@ checksum = "a848fb2d43cc8e5adabdedc6b37a88b45653d3a23b000a3d047e6953d5af42ea" dependencies = [ "indexmap", "opentelemetry", - "sysinfo 0.24.5", + "sysinfo", ] [[package]] @@ -3705,22 +3695,7 @@ dependencies = [ "cfg-if", "core-foundation-sys", "libc", - "ntapi 0.3.7", - "once_cell", - "rayon", 
- "winapi", -] - -[[package]] -name = "sysinfo" -version = "0.26.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6d0dedf2e65d25b365c588382be9dc3a3ee4b0ed792366cf722d174c359d948" -dependencies = [ - "cfg-if", - "core-foundation-sys", - "libc", - "ntapi 0.4.0", + "ntapi", "once_cell", "rayon", "winapi", diff --git a/binaries/cli/Cargo.toml b/binaries/cli/Cargo.toml index 0c504b51..efc458e2 100644 --- a/binaries/cli/Cargo.toml +++ b/binaries/cli/Cargo.toml @@ -21,6 +21,5 @@ serde_json = "1.0.86" termcolor = "1.1.3" atty = "0.2.14" uuid = { version = "1.2.1", features = ["v4", "serde"] } -sysinfo = "0.26.6" inquire = "0.5.2" communication-layer-request-reply = { path = "../../libraries/communication-layer/request-reply" } diff --git a/binaries/cli/src/check.rs b/binaries/cli/src/check.rs index 64bf3e47..37a733e4 100644 --- a/binaries/cli/src/check.rs +++ b/binaries/cli/src/check.rs @@ -6,7 +6,6 @@ use dora_core::{ }; use eyre::{bail, eyre, Context}; use std::{env::consts::EXE_EXTENSION, io::Write, path::Path}; -use sysinfo::SystemExt; use termcolor::{Color, ColorChoice, ColorSpec, WriteColor}; pub fn check_environment() -> eyre::Result<()> { @@ -32,20 +31,6 @@ pub fn check_environment() -> eyre::Result<()> { } let _ = stdout.reset(); - // check whether roudi is running - write!(stdout, "Iceoryx Daemon: ")?; - let system = sysinfo::System::new_all(); - match system.processes_by_exact_name("iox-roudi").next() { - Some(_) => { - let _ = stdout.set_color(ColorSpec::new().set_fg(Some(Color::Green))); - writeln!(stdout, "ok")?; - } - None => { - let _ = stdout.set_color(ColorSpec::new().set_fg(Some(Color::Red))); - writeln!(stdout, "not running")?; - error_occured = true; - } - } writeln!(stdout)?; if error_occured { diff --git a/binaries/cli/src/main.rs b/binaries/cli/src/main.rs index 3b978f03..616dd288 100644 --- a/binaries/cli/src/main.rs +++ b/binaries/cli/src/main.rs @@ -48,8 +48,6 @@ enum Command { #[clap(long)] config: Option, 
#[clap(long)] - roudi_path: Option, - #[clap(long)] coordinator_path: Option, }, Destroy { @@ -126,11 +124,9 @@ fn main() -> eyre::Result<()> { Command::Dashboard => todo!(), Command::Up { config, - roudi_path, coordinator_path, } => up::up( config.as_deref(), - roudi_path.as_deref(), coordinator_path.as_deref(), )?, Command::Start { dataflow, name } => start_dataflow(dataflow, name, &mut session)?, diff --git a/binaries/cli/src/up.rs b/binaries/cli/src/up.rs index 6628b664..9e0c4c62 100644 --- a/binaries/cli/src/up.rs +++ b/binaries/cli/src/up.rs @@ -1,37 +1,22 @@ use crate::{check::coordinator_running, control_connection}; use communication_layer_request_reply::TcpRequestReplyConnection; use dora_core::topics::ControlRequest; -use eyre::{bail, Context}; -use std::{fs, path::Path, process::Command}; -use sysinfo::{ProcessExt, SystemExt}; +use eyre::Context; +use std::{fs, path::Path, process::Command, time::Duration}; -#[derive(Debug, serde::Serialize, serde::Deserialize)] -struct UpConfig { - iceoryx: bool, -} - -impl Default for UpConfig { - fn default() -> Self { - Self { iceoryx: true } - } -} +#[derive(Debug, Default, serde::Serialize, serde::Deserialize)] +struct UpConfig {} pub(crate) fn up( config_path: Option<&Path>, - roudi: Option<&Path>, coordinator: Option<&Path>, ) -> eyre::Result<()> { - let UpConfig { iceoryx } = parse_dora_config(config_path)?; + let UpConfig {} = parse_dora_config(config_path)?; if !coordinator_running()? { start_coordinator(coordinator).wrap_err("failed to start dora-coordinator")?; } - if iceoryx { - // try to start roudi - start_roudi(roudi).wrap_err("failed to start iceoryx roudi daemon")?; - } - Ok(()) } @@ -39,7 +24,7 @@ pub(crate) fn destroy( config_path: Option<&Path>, session: &mut Option>, ) -> Result<(), eyre::ErrReport> { - let UpConfig { iceoryx } = parse_dora_config(config_path)?; + let UpConfig {} = parse_dora_config(config_path)?; if coordinator_running()? 
{ // send destroy command to dora-coordinator @@ -51,25 +36,6 @@ pub(crate) fn destroy( eprintln!("The dora-coordinator is not running"); } - if iceoryx { - // kill iox-roudi process - let system = sysinfo::System::new_all(); - let processes: Vec<_> = system.processes_by_exact_name("iox-roudi").collect(); - if processes.is_empty() { - eprintln!("No `iox-roudi` process found"); - } else if processes.len() == 1 { - let process = processes[0]; - let success = process.kill(); - if success { - println!("Killed `iox-roudi` process"); - } else { - bail!("failed to kill iox-roudi process"); - } - } else { - bail!("multiple iox-roudi processes found, please kill the correct processes manually"); - } - } - Ok(()) } @@ -99,14 +65,6 @@ fn start_coordinator(coordinator: Option<&Path>) -> eyre::Result<()> { Ok(()) } -fn start_roudi(roudi: Option<&Path>) -> eyre::Result<()> { - let roudi = roudi.unwrap_or_else(|| Path::new("iox-roudi")); - let mut cmd = Command::new(roudi); - cmd.spawn() - .wrap_err_with(|| format!("failed to run {}", roudi.display()))?; - println!("started iox-roudi daemon"); - Ok(()) -} From 7448594622d1aa6c41c017cc2bf74e91dc0cf196 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 12:16:43 +0100 Subject: [PATCH 069/225] Update `dora check` to check whether daemon is running --- binaries/cli/src/check.rs | 33 ++++++++++++++++++++++++++++++++- binaries/coordinator/src/lib.rs | 4 ++++ libraries/core/src/topics.rs | 1 + 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/binaries/cli/src/check.rs b/binaries/cli/src/check.rs index 37a733e4..005c517d 100644 --- a/binaries/cli/src/check.rs +++ b/binaries/cli/src/check.rs @@ -3,6 +3,7 @@ use dora_core::{ adjust_shared_library_path, config::{InputMapping, UserInputMapping}, descriptor::{self, source_is_url, CoreNodeKind, OperatorSource}, + topics::ControlRequest, }; use eyre::{bail, eyre, Context}; use std::{env::consts::EXE_EXTENSION, io::Write, path::Path}; @@ -19,7 +20,6 @@ pub fn 
check_environment() -> eyre::Result<()> { let mut stdout = termcolor::StandardStream::stdout(color_choice); // check whether coordinator is running - write!(stdout, "Dora Coordinator: ")?; if coordinator_running()? { let _ = stdout.set_color(ColorSpec::new().set_fg(Some(Color::Green))); @@ -31,6 +31,18 @@ pub fn check_environment() -> eyre::Result<()> { } let _ = stdout.reset(); + // check whether daemon is running + write!(stdout, "Dora Daemon: ")?; + if daemon_running()? { + let _ = stdout.set_color(ColorSpec::new().set_fg(Some(Color::Green))); + writeln!(stdout, "ok")?; + } else { + let _ = stdout.set_color(ColorSpec::new().set_fg(Some(Color::Red))); + writeln!(stdout, "not running")?; + error_occured = true; + } + let _ = stdout.reset(); + writeln!(stdout)?; if error_occured { @@ -46,6 +58,25 @@ pub fn coordinator_running() -> Result { Ok(connected) } +pub fn daemon_running() -> Result { + let mut control_session = None; + let running = match control_connection(&mut control_session) { + Ok(connection) => { + let reply_raw = connection + .request(&serde_json::to_vec(&ControlRequest::DaemonConnected).unwrap()) + .wrap_err("failed to send DaemonConnected message")?; + + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")? 
+ } + Err(_) => { + // coordinator is not running + false + } + }; + + Ok(running) +} + pub fn check_dataflow(dataflow_path: &Path, runtime: Option<&Path>) -> eyre::Result<()> { let descriptor = read_descriptor(dataflow_path).wrap_err_with(|| { format!( diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index bfbef430..af0987f5 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -253,6 +253,10 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { serde_json::to_vec(&reply).unwrap() } + ControlRequest::DaemonConnected => { + let running = !daemon_connections.is_empty(); + serde_json::to_vec(&running).unwrap() + } }; let _ = reply_sender.send(reply); } diff --git a/libraries/core/src/topics.rs b/libraries/core/src/topics.rs index 23463264..524f6299 100644 --- a/libraries/core/src/topics.rs +++ b/libraries/core/src/topics.rs @@ -27,6 +27,7 @@ pub enum ControlRequest { }, Destroy, List, + DaemonConnected, } #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] From 35e86684d04a57a24abff3ee2cb03d0e4dc2a438 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 12:18:09 +0100 Subject: [PATCH 070/225] Update `up` and `destroy` commands for dora-daemon and ensure proper coordinator exit --- binaries/cli/src/main.rs | 4 ++ binaries/cli/src/up.rs | 21 +++++++++- binaries/coordinator/src/lib.rs | 56 +++++++++++++++++++++++---- binaries/daemon/src/lib.rs | 49 +++++++++++++++-------- libraries/core/src/daemon_messages.rs | 2 + 5 files changed, 107 insertions(+), 25 deletions(-) diff --git a/binaries/cli/src/main.rs b/binaries/cli/src/main.rs index 616dd288..dec4aeca 100644 --- a/binaries/cli/src/main.rs +++ b/binaries/cli/src/main.rs @@ -49,6 +49,8 @@ enum Command { config: Option, #[clap(long)] coordinator_path: Option, + #[clap(long)] + daemon_path: Option, }, Destroy { #[clap(long)] @@ -125,9 +127,11 @@ fn main() -> eyre::Result<()> { Command::Up { config, coordinator_path, + 
daemon_path, } => up::up( config.as_deref(), coordinator_path.as_deref(), + daemon_path.as_deref(), )?, Command::Start { dataflow, name } => start_dataflow(dataflow, name, &mut session)?, Command::List => list(&mut session)?, diff --git a/binaries/cli/src/up.rs b/binaries/cli/src/up.rs index 9e0c4c62..54cd6b5f 100644 --- a/binaries/cli/src/up.rs +++ b/binaries/cli/src/up.rs @@ -1,4 +1,7 @@ -use crate::{check::coordinator_running, control_connection}; +use crate::{ + check::{coordinator_running, daemon_running}, + control_connection, +}; use communication_layer_request_reply::TcpRequestReplyConnection; use dora_core::topics::ControlRequest; use eyre::Context; @@ -10,11 +13,19 @@ struct UpConfig {} pub(crate) fn up( config_path: Option<&Path>, coordinator: Option<&Path>, + daemon: Option<&Path>, ) -> eyre::Result<()> { let UpConfig {} = parse_dora_config(config_path)?; if !coordinator_running()? { start_coordinator(coordinator).wrap_err("failed to start dora-coordinator")?; + // sleep a bit until the coordinator accepts connections + while !coordinator_running()? { + std::thread::sleep(Duration::from_millis(50)); + } + } + if !daemon_running()? 
{ + start_daemon(daemon).wrap_err("failed to start dora-daemon")?; } Ok(()) @@ -65,6 +76,14 @@ fn start_coordinator(coordinator: Option<&Path>) -> eyre::Result<()> { Ok(()) } +fn start_daemon(daemon: Option<&Path>) -> eyre::Result<()> { + let daemon = daemon.unwrap_or_else(|| Path::new("dora-daemon")); + let mut cmd = Command::new(daemon); + cmd.spawn() + .wrap_err_with(|| format!("failed to run {}", daemon.display()))?; + println!("started dora daemon"); + Ok(()) +} diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index af0987f5..b54d71f5 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -55,13 +55,15 @@ pub async fn run(args: Args) -> eyre::Result<()> { async fn start(runtime_path: &Path) -> eyre::Result<()> { let listener = listener::create_listener(DORA_COORDINATOR_PORT_DEFAULT).await?; - let new_daemon_connections = TcpListenerStream::new(listener).map(|c| { - c.map(Event::NewDaemonConnection) - .wrap_err("failed to open connection") - .unwrap_or_else(Event::DaemonConnectError) - }); + let (new_daemon_connections, new_daemon_connections_abort) = + futures::stream::abortable(TcpListenerStream::new(listener).map(|c| { + c.map(Event::NewDaemonConnection) + .wrap_err("failed to open connection") + .unwrap_or_else(Event::DaemonConnectError) + })); let (daemon_events_tx, daemon_events) = tokio::sync::mpsc::channel(2); + let mut daemon_events_tx = Some(daemon_events_tx); let daemon_events = ReceiverStream::new(daemon_events); let (control_events, control_events_abort) = futures::stream::abortable( @@ -80,7 +82,13 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { match event { Event::NewDaemonConnection(connection) => { let events_tx = daemon_events_tx.clone(); - tokio::spawn(listener::handle_connection(connection, events_tx)); + if let Some(events_tx) = events_tx { + tokio::spawn(listener::handle_connection(connection, events_tx)); + } else { + tracing::warn!( + "ignoring new daemon 
connection because events_tx was closed already" + ); + } } Event::DaemonConnectError(err) => { tracing::warn!("{:?}", err.wrap_err("failed to connect to dora-daemon")); @@ -235,6 +243,13 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { .await?; } + // destroy all connected daemons + destroy_daemons(&mut daemon_connections).await?; + + // prevent the creation of new daemon connections + new_daemon_connections_abort.abort(); + daemon_events_tx = None; + b"ok".as_slice().into() } ControlRequest::List => { @@ -317,7 +332,7 @@ async fn stop_dataflow( _ => bail!("unexpected reply"), } } - tracing::info!("successfully stoped dataflow `{uuid}`"); + tracing::info!("successfully stopped dataflow `{uuid}`"); Ok(()) } @@ -343,6 +358,33 @@ async fn start_dataflow( }) } +async fn destroy_daemons(daemon_connections: &mut HashMap) -> eyre::Result<()> { + let message = serde_json::to_vec(&DaemonCoordinatorEvent::Destroy)?; + + for (machine_id, mut daemon_connection) in daemon_connections.drain() { + tcp_send(&mut daemon_connection, &message) + .await + .wrap_err("failed to send destroy message to daemon")?; + + // wait for reply + let reply_raw = tcp_receive(&mut daemon_connection) + .await + .wrap_err("failed to receive destroy reply from daemon")?; + match serde_json::from_slice(&reply_raw) + .wrap_err("failed to deserialize destroy reply from daemon")? 
+ { + DaemonCoordinatorReply::DestroyResult(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to destroy dataflow")?, + _ => bail!("unexpected reply"), + } + + tracing::info!("successfully destroyed daemon `{machine_id}`"); + } + + Ok(()) +} + #[derive(Debug)] pub enum Event { NewDaemonConnection(TcpStream), diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 39582fac..ec89b7a3 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -118,7 +118,7 @@ impl Daemon { .and_then(|r| async { match r { DaemonCoordinatorReply::SpawnResult(result) => result.map_err(|err| eyre!(err)), - DaemonCoordinatorReply::StopResult(_) => Err(eyre!("unexpected spawn reply")), + _ => Err(eyre!("unexpected spawn reply")), } }); @@ -180,10 +180,12 @@ impl Daemon { tracing::warn!("{:?}", err.wrap_err("failed to connect")); } Event::Coordinator(CoordinatorEvent { event, reply_tx }) => { - let result = self.handle_coordinator_event(event).await; - let _ = reply_tx.send(DaemonCoordinatorReply::SpawnResult( - result.map_err(|err| format!("{err:?}")), - )); + let (reply, status) = self.handle_coordinator_event(event).await; + let _ = reply_tx.send(reply); + match status { + RunStatus::Continue => {} + RunStatus::Exit => break, + } } Event::Node { dataflow_id: dataflow, @@ -219,22 +221,35 @@ impl Daemon { async fn handle_coordinator_event( &mut self, event: DaemonCoordinatorEvent, - ) -> eyre::Result<()> { + ) -> (DaemonCoordinatorReply, RunStatus) { match event { DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { dataflow_id, nodes }) => { - self.spawn_dataflow(dataflow_id, nodes).await + let result = self.spawn_dataflow(dataflow_id, nodes).await; + let reply = + DaemonCoordinatorReply::SpawnResult(result.map_err(|err| format!("{err:?}"))); + (reply, RunStatus::Continue) } DaemonCoordinatorEvent::StopDataflow { dataflow_id } => { - let dataflow = self - .running - .get_mut(&dataflow_id) - .wrap_err_with(|| format!("no running dataflow with 
ID `{dataflow_id}`"))?; - - for channel in dataflow.subscribe_channels.values_mut() { - let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; - } - - Ok(()) + let stop = async { + let dataflow = self + .running + .get_mut(&dataflow_id) + .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; + + for channel in dataflow.subscribe_channels.values_mut() { + let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; + } + Result::<(), eyre::Report>::Ok(()) + }; + let reply = DaemonCoordinatorReply::SpawnResult( + stop.await.map_err(|err| format!("{err:?}")), + ); + (reply, RunStatus::Continue) + } + DaemonCoordinatorEvent::Destroy => { + tracing::info!("received destroy command -> exiting"); + let reply = DaemonCoordinatorReply::DestroyResult(Ok(())); + (reply, RunStatus::Exit) } } } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 2d00e5bc..74f471ef 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -84,12 +84,14 @@ pub struct InputData { pub enum DaemonCoordinatorEvent { Spawn(SpawnDataflowNodes), StopDataflow { dataflow_id: DataflowId }, + Destroy, } #[derive(Debug, serde::Deserialize, serde::Serialize)] pub enum DaemonCoordinatorReply { SpawnResult(Result<(), String>), StopResult(Result<(), String>), + DestroyResult(Result<(), String>), } pub type DataflowId = Uuid; From 2b7d2508f4d2a892adb47720b411e2f57bdbbbcd Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 15:05:50 +0100 Subject: [PATCH 071/225] Implement watchdog messages for detecting sudden disconnects of daemon --- binaries/coordinator/src/lib.rs | 68 ++++++++++++++++++++++++++- binaries/daemon/src/lib.rs | 3 ++ libraries/core/src/daemon_messages.rs | 2 + 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index b54d71f5..5716931f 100644 --- 
a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -19,6 +19,7 @@ use run::SpawnedDataflow; use std::{ collections::{BTreeSet, HashMap}, path::{Path, PathBuf}, + time::Duration, }; use tokio::net::TcpStream; use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream}; @@ -72,13 +73,25 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { .wrap_err("failed to create control events")?, ); - let mut events = (new_daemon_connections, daemon_events, control_events).merge(); + let daemon_watchdog_interval = + tokio_stream::wrappers::IntervalStream::new(tokio::time::interval(Duration::from_secs(1))) + .map(|_| Event::DaemonWatchdogInterval); + + let mut events = ( + new_daemon_connections, + daemon_events, + control_events, + daemon_watchdog_interval, + ) + .merge(); let mut running_dataflows: HashMap = HashMap::new(); let mut daemon_connections: HashMap<_, TcpStream> = HashMap::new(); while let Some(event) = events.next().await { - tracing::trace!("Handling event {event:?}"); + if event.log() { + tracing::trace!("Handling event {event:?}"); + } match event { Event::NewDaemonConnection(connection) => { let events_tx = daemon_events_tx.clone(); @@ -277,6 +290,28 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { } ControlEvent::Error(err) => tracing::error!("{err:?}"), }, + Event::DaemonWatchdogInterval => { + let mut disconnected = BTreeSet::new(); + for (machine_id, connection) in &mut daemon_connections { + let result: eyre::Result<()> = + tokio::time::timeout(Duration::from_millis(100), send_watchdog_message(connection)) + .await + .wrap_err("timeout") + .and_then(|r| r).wrap_err_with(|| + format!("daemon at `{machine_id}` did not react as expected to watchdog message"), + ); + if let Err(err) = result { + tracing::warn!("{err:?}"); + disconnected.insert(machine_id.clone()); + } + } + if !disconnected.is_empty() { + tracing::info!("Disconnecting daemons that failed watchdog: {disconnected:?}"); + for machine_id in 
disconnected { + daemon_connections.remove(&machine_id); + } + } + } } } @@ -285,6 +320,24 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { Ok(()) } +async fn send_watchdog_message(connection: &mut TcpStream) -> eyre::Result<()> { + let message = serde_json::to_vec(&DaemonCoordinatorEvent::Watchdog).unwrap(); + + tcp_send(connection, &message) + .await + .wrap_err("failed to send watchdog message to daemon")?; + let reply_raw = tcp_receive(connection) + .await + .wrap_err("failed to receive stop reply from daemon")?; + + match serde_json::from_slice(&reply_raw) + .wrap_err("failed to deserialize stop reply from daemon")? + { + DaemonCoordinatorReply::WatchdogAck => Ok(()), + _ => bail!("unexpected reply"), + } +} + struct RunningDataflow { name: Option, uuid: Uuid, @@ -392,6 +445,17 @@ pub enum Event { Dataflow { uuid: Uuid, event: DataflowEvent }, Control(ControlEvent), Daemon(DaemonEvent), + DaemonWatchdogInterval, +} +impl Event { + /// Whether this event should be logged. 
+ #[allow(clippy::match_like_matches_macro)] + pub fn log(&self) -> bool { + match self { + Event::DaemonWatchdogInterval => false, + _ => true, + } + } } #[derive(Debug)] diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index ec89b7a3..f6554cb4 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -251,6 +251,9 @@ impl Daemon { let reply = DaemonCoordinatorReply::DestroyResult(Ok(())); (reply, RunStatus::Exit) } + DaemonCoordinatorEvent::Watchdog => { + (DaemonCoordinatorReply::WatchdogAck, RunStatus::Continue) + } } } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 74f471ef..6842d71a 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -85,6 +85,7 @@ pub enum DaemonCoordinatorEvent { Spawn(SpawnDataflowNodes), StopDataflow { dataflow_id: DataflowId }, Destroy, + Watchdog, } #[derive(Debug, serde::Deserialize, serde::Serialize)] @@ -92,6 +93,7 @@ pub enum DaemonCoordinatorReply { SpawnResult(Result<(), String>), StopResult(Result<(), String>), DestroyResult(Result<(), String>), + WatchdogAck, } pub type DataflowId = Uuid; From 218106721d9a29256598aaf0c05bc92963aec4f2 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 15:18:58 +0100 Subject: [PATCH 072/225] Add watchdog for checking that coordinator is still reachable Throw an error if a daemon cannot reach the coordinator anymore. This is only an interim solution. In the future, we want to make the daemon more robust and ideally even allow restarts of the coordinator. 
--- binaries/coordinator/src/listener.rs | 15 +++++++++-- binaries/daemon/src/coordinator.rs | 6 +++-- binaries/daemon/src/lib.rs | 31 +++++++++++++++++++++- libraries/core/src/coordinator_messages.rs | 4 +++ 4 files changed, 51 insertions(+), 5 deletions(-) diff --git a/binaries/coordinator/src/listener.rs b/binaries/coordinator/src/listener.rs index a7cd938b..15c11ad7 100644 --- a/binaries/coordinator/src/listener.rs +++ b/binaries/coordinator/src/listener.rs @@ -1,7 +1,10 @@ -use crate::{tcp_utils::tcp_receive, DaemonEvent, DataflowEvent, Event}; +use crate::{ + tcp_utils::{tcp_receive, tcp_send}, + DaemonEvent, DataflowEvent, Event, +}; use dora_core::coordinator_messages; use eyre::{eyre, Context}; -use std::{io::ErrorKind, net::Ipv4Addr}; +use std::{io::ErrorKind, net::Ipv4Addr, time::Duration}; use tokio::{ net::{TcpListener, TcpStream}, sync::mpsc, @@ -66,6 +69,14 @@ pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sende break; } } + coordinator_messages::DaemonEvent::Watchdog => { + let reply = serde_json::to_vec(&coordinator_messages::WatchdogAck).unwrap(); + _ = tokio::time::timeout( + Duration::from_millis(10), + tcp_send(&mut connection, &reply), + ) + .await; + } }, }; } diff --git a/binaries/daemon/src/coordinator.rs b/binaries/daemon/src/coordinator.rs index 99477123..69973582 100644 --- a/binaries/daemon/src/coordinator.rs +++ b/binaries/daemon/src/coordinator.rs @@ -98,7 +98,7 @@ pub async fn send_event( addr: SocketAddr, machine_id: String, event: DaemonEvent, -) -> eyre::Result<()> { +) -> eyre::Result { let mut stream = TcpStream::connect(addr) .await .wrap_err("failed to connect to dora-coordinator")?; @@ -108,5 +108,7 @@ pub async fn send_event( let msg = serde_json::to_vec(&CoordinatorRequest::Event { machine_id, event })?; tcp_send(&mut stream, &msg) .await - .wrap_err("failed to send event to dora-coordinator") + .wrap_err("failed to send event to dora-coordinator")?; + + Ok(stream) } diff --git 
a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index f6554cb4..b2303ecc 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -20,6 +20,7 @@ use std::{ rc::Rc, time::Duration, }; +use tcp_utils::tcp_receive; use tokio::{ fs, net::TcpStream, @@ -157,7 +158,17 @@ impl Daemon { exit_when_done, }; let dora_events = ReceiverStream::new(dora_events_rx).map(Event::Dora); - let events = (external_events, new_connections, dora_events).merge(); + let watchdog_interval = tokio_stream::wrappers::IntervalStream::new(tokio::time::interval( + Duration::from_secs(5), + )) + .map(|_| Event::WatchdogInterval); + let events = ( + external_events, + new_connections, + dora_events, + watchdog_interval, + ) + .merge(); daemon.run_inner(events).await } @@ -212,6 +223,23 @@ impl Daemon { None => tracing::warn!("received unknown drop token {token:?}"), } } + Event::WatchdogInterval => { + if let Some(addr) = self.coordinator_addr { + let mut connection = coordinator::send_event( + addr, + self.machine_id.clone(), + DaemonEvent::Watchdog, + ) + .await + .wrap_err("lost connection to coordinator")?; + let reply_raw = tcp_receive(&mut connection) + .await + .wrap_err("lost connection to coordinator")?; + let _: dora_core::coordinator_messages::WatchdogAck = + serde_json::from_slice(&reply_raw) + .wrap_err("received unexpected watchdog reply from coordinator")?; + } + } } } @@ -620,6 +648,7 @@ pub enum Event { Coordinator(CoordinatorEvent), Dora(DoraEvent), Drop(DropEvent), + WatchdogInterval, } #[derive(Debug)] diff --git a/libraries/core/src/coordinator_messages.rs b/libraries/core/src/coordinator_messages.rs index 48618b3c..382fe757 100644 --- a/libraries/core/src/coordinator_messages.rs +++ b/libraries/core/src/coordinator_messages.rs @@ -19,6 +19,7 @@ pub enum DaemonEvent { dataflow_id: DataflowId, result: Result<(), String>, }, + Watchdog, } #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -35,3 +36,6 @@ impl RegisterResult { } } } + 
+#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct WatchdogAck; From 19214c482912c3dba7d308408524b21884213163 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 29 Dec 2022 17:33:06 +0100 Subject: [PATCH 073/225] Add a basic benchmark to test throughput and latency for different message sizes --- .github/workflows/ci.yml | 5 +++ Cargo.lock | 19 ++++++++ Cargo.toml | 5 +++ examples/benchmark/dataflow.yml | 18 ++++++++ examples/benchmark/node/Cargo.toml | 13 ++++++ examples/benchmark/node/src/main.rs | 21 +++++++++ examples/benchmark/run.rs | 43 ++++++++++++++++++ examples/benchmark/sink/Cargo.toml | 10 +++++ examples/benchmark/sink/src/main.rs | 68 +++++++++++++++++++++++++++++ 9 files changed, 202 insertions(+) create mode 100644 examples/benchmark/dataflow.yml create mode 100644 examples/benchmark/node/Cargo.toml create mode 100644 examples/benchmark/node/src/main.rs create mode 100644 examples/benchmark/run.rs create mode 100644 examples/benchmark/sink/Cargo.toml create mode 100644 examples/benchmark/sink/src/main.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3ae03a65..3b5d13dd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,6 +60,11 @@ jobs: - name: "Rust Dataflow example" timeout-minutes: 30 run: cargo run --example rust-dataflow + + - name: "Benchmark example" + timeout-minutes: 30 + run: cargo run --example benchmark --release + - name: "C Dataflow example" timeout-minutes: 15 run: cargo run --example c-dataflow diff --git a/Cargo.lock b/Cargo.lock index b08f01c5..dc121df2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -272,6 +272,25 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6b4d9b1225d28d360ec6a231d65af1fd99a2a095154c8040689617290569c5c" +[[package]] +name = "benchmark-example-node" +version = "0.1.2" +dependencies = [ + "dora-node-api", + "eyre", + "futures", + "rand", + "tokio", +] + +[[package]] +name = 
"benchmark-example-sink" +version = "0.1.2" +dependencies = [ + "dora-node-api", + "eyre", +] + [[package]] name = "bincode" version = "1.3.3" diff --git a/Cargo.toml b/Cargo.toml index 2de4e586..595fdc34 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ members = [ "binaries/daemon", # "binaries/runtime", "examples/rust-dataflow/*", + "examples/benchmark/*", "libraries/communication-layer/*", "libraries/core", "libraries/message", @@ -65,3 +66,7 @@ path = "examples/c++-dataflow/run.rs" [[example]] name = "python-dataflow" path = "examples/python-dataflow/run.rs" + +[[example]] +name = "benchmark" +path = "examples/benchmark/run.rs" diff --git a/examples/benchmark/dataflow.yml b/examples/benchmark/dataflow.yml new file mode 100644 index 00000000..87452f6c --- /dev/null +++ b/examples/benchmark/dataflow.yml @@ -0,0 +1,18 @@ +communication: + zenoh: + prefix: /benchmark-example + +nodes: + - id: rust-node + custom: + build: cargo build -p benchmark-example-node --release + source: ../../target/release/benchmark-example-node + outputs: + - random + - id: rust-sink + custom: + build: cargo build -p benchmark-example-sink --release + source: ../../target/release/benchmark-example-sink + inputs: + # message: runtime-node/rust-operator/status + message: rust-node/random diff --git a/examples/benchmark/node/Cargo.toml b/examples/benchmark/node/Cargo.toml new file mode 100644 index 00000000..35e582a7 --- /dev/null +++ b/examples/benchmark/node/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "benchmark-example-node" +version.workspace = true +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dora-node-api = { workspace = true } +eyre = "0.6.8" +futures = "0.3.21" +rand = "0.8.5" +tokio = { version = "1.20.1", features = ["rt", "macros"] } diff --git a/examples/benchmark/node/src/main.rs b/examples/benchmark/node/src/main.rs new file mode 100644 index 00000000..53644f8b --- /dev/null 
+++ b/examples/benchmark/node/src/main.rs @@ -0,0 +1,21 @@ +use dora_node_api::{self, dora_core::config::DataId, DoraNode}; +use rand::Rng; + +fn main() -> eyre::Result<()> { + let output = DataId::from("random".to_owned()); + + let (mut node, _events) = DoraNode::init_from_env()?; + for size in [0, 8, 64, 512, 2048, 4096, 4 * 4096, 10 * 4096] { + for _ in 0..100 { + let data: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(size) + .collect(); + node.send_output(output.clone(), Default::default(), data.len(), |out| { + out.copy_from_slice(&data); + })?; + } + } + + Ok(()) +} diff --git a/examples/benchmark/run.rs b/examples/benchmark/run.rs new file mode 100644 index 00000000..37f26473 --- /dev/null +++ b/examples/benchmark/run.rs @@ -0,0 +1,43 @@ +use eyre::{bail, Context}; +use std::path::Path; +use tracing::metadata::LevelFilter; +use tracing_subscriber::Layer; + +#[tokio::main] +async fn main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing subscriber")?; + + let root = Path::new(env!("CARGO_MANIFEST_DIR")); + std::env::set_current_dir(root.join(file!()).parent().unwrap()) + .wrap_err("failed to set working dir")?; + + let dataflow = Path::new("dataflow.yml"); + build_dataflow(dataflow).await?; + + dora_daemon::Daemon::run_dataflow(dataflow).await?; + + Ok(()) +} + +async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + cmd.arg("run"); + cmd.arg("--package").arg("dora-cli"); + cmd.arg("--").arg("build").arg(dataflow); + if !cmd.status().await?.success() { + bail!("failed to build dataflow"); + }; + Ok(()) +} + +fn set_up_tracing() -> eyre::Result<()> { + use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer() + .pretty() + .with_filter(LevelFilter::DEBUG); + let subscriber = 
tracing_subscriber::Registry::default().with(stdout_log); + tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} diff --git a/examples/benchmark/sink/Cargo.toml b/examples/benchmark/sink/Cargo.toml new file mode 100644 index 00000000..58545c97 --- /dev/null +++ b/examples/benchmark/sink/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "benchmark-example-sink" +version.workspace = true +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +dora-node-api = { workspace = true } +eyre = "0.6.8" diff --git a/examples/benchmark/sink/src/main.rs b/examples/benchmark/sink/src/main.rs new file mode 100644 index 00000000..5ad29240 --- /dev/null +++ b/examples/benchmark/sink/src/main.rs @@ -0,0 +1,68 @@ +use dora_node_api::{self, daemon::Event, DoraNode}; +use eyre::ContextCompat; +use std::{ + fmt::Write as _, + time::{Duration, Instant}, +}; + +fn main() -> eyre::Result<()> { + let (_node, mut events) = DoraNode::init_from_env()?; + + let mut current_size = 0; + let mut n = 0; + let mut start = Instant::now(); + let mut latencies = Vec::new(); + + let mut summary = String::new(); + + while let Some(event) = events.recv() { + match event { + Event::Stop => break, + Event::Input { id, metadata, data } => match id.as_str() { + "message" => { + let data = data.as_deref().unwrap_or_default(); + + if data.len() != current_size { + if n > 0 { + record_results(start, current_size, n, latencies, &mut summary); + } + current_size = data.len(); + n = 0; + start = Instant::now(); + latencies = Vec::new(); + } + n += 1; + latencies.push(metadata.timestamp().get_time().to_system_time().elapsed()?); + } + other => eprintln!("Ignoring unexpected input `{other}`"), + }, + Event::InputClosed { id } => { + println!("Input `{id}` was closed -> exiting"); + break; + } + other => eprintln!("Received unexpected input: {other:?}"), + } + } + + 
record_results(start, current_size, n, latencies, &mut summary); + + println!("\nSummary:\n{summary}"); + + Ok(()) +} + +fn record_results( + start: Instant, + current_size: usize, + n: u32, + latencies: Vec, + summary: &mut String, +) { + let duration = start.elapsed(); + let per_message = duration / n; + let avg_latency = latencies.iter().sum::() / n; + let msg = + format!("size {current_size:<#8x}: {per_message:?} per message (latency: {avg_latency:?})"); + println!("{msg}"); + writeln!(summary, "{msg}").unwrap(); +} From de4b0488531592216d45094e93a37e29176f920f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 2 Jan 2023 17:19:13 +0100 Subject: [PATCH 074/225] Fix performance: Set `nodelay` on listener side too Without this, the TCP connection uses Nagle's algorithm, which delays sending of packets until ACKs are received. This is problematic because most OSs deliberately delay ACKs to reduce overhead. For example, the default ACK delay on Linux is 40ms, which leads to 80ms latency when sending an output. 
--- binaries/coordinator/src/lib.rs | 1 + binaries/daemon/src/lib.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 5716931f..db87a046 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -94,6 +94,7 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { } match event { Event::NewDaemonConnection(connection) => { + connection.set_nodelay(true)?; let events_tx = daemon_events_tx.clone(); if let Some(events_tx) = events_tx { tokio::spawn(listener::handle_connection(connection, events_tx)); diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index b2303ecc..42f582a8 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -184,6 +184,7 @@ impl Daemon { while let Some(event) = events.next().await { match event { Event::NewConnection(connection) => { + connection.set_nodelay(true)?; let events_tx = node_events_tx.clone(); tokio::spawn(listener::handle_connection(connection, events_tx)); } From aa85b6ee4183a9381e299d6ca962878e883fb446 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 2 Jan 2023 17:21:21 +0100 Subject: [PATCH 075/225] Increase benchmark messages sizes (up to 10000 4k pages) --- examples/benchmark/node/src/main.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/benchmark/node/src/main.rs b/examples/benchmark/node/src/main.rs index 53644f8b..14b63c7d 100644 --- a/examples/benchmark/node/src/main.rs +++ b/examples/benchmark/node/src/main.rs @@ -5,7 +5,20 @@ fn main() -> eyre::Result<()> { let output = DataId::from("random".to_owned()); let (mut node, _events) = DoraNode::init_from_env()?; - for size in [0, 8, 64, 512, 2048, 4096, 4 * 4096, 10 * 4096] { + let sizes = [ + 0, + 8, + 64, + 512, + 2048, + 4096, + 4 * 4096, + 10 * 4096, + 100 * 4096, + 1000 * 4096, + 10000 * 4096, + ]; + for size in sizes { for _ in 0..100 { let data: Vec = rand::thread_rng() 
.sample_iter(rand::distributions::Standard) From 27daceafd1a72da0ffcd48548e7b21889955515f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 2 Jan 2023 17:38:13 +0100 Subject: [PATCH 076/225] Fix typo Co-authored-by: Haixuan Xavier Tao --- apis/python/node/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index 617cc220..6d45bbf5 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -42,7 +42,7 @@ impl IntoPy for PyInput<'_> { } Event::InputClosed { id } => { dict.set_item("id", id.to_string()) - .wrap_err("failed to add clsoed-input ID") + .wrap_err("failed to add closed-input ID") .unwrap(); "input-closed" } From 5666895a7c5d6445f6577795466e9506fef52fba Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 2 Jan 2023 18:24:02 +0100 Subject: [PATCH 077/225] Close event channel when last input is closed or when stop is received --- binaries/daemon/src/lib.rs | 35 +++++++++++++++----- examples/python-dataflow/no_webcam.py | 2 -- examples/python-dataflow/object_detection.py | 2 -- examples/python-dataflow/plot.py | 2 -- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 42f582a8..7ac18f08 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -265,7 +265,7 @@ impl Daemon { .get_mut(&dataflow_id) .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; - for channel in dataflow.subscribe_channels.values_mut() { + for (_node_id, channel) in dataflow.subscribe_channels.drain() { let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; } Result::<(), eyre::Report>::Ok(()) @@ -298,7 +298,13 @@ impl Daemon { } }; for (node_id, params) in nodes { + dataflow.running_nodes.insert(node_id.clone()); for (input_id, mapping) in params.node.run_config.inputs.clone() { + dataflow + .open_inputs + .entry(node_id.clone()) + .or_default() 
+ .insert(input_id.clone()); match mapping { InputMapping::User(mapping) => { if mapping.operator.is_some() { @@ -366,7 +372,9 @@ impl Daemon { dataflow.subscribe_channels.insert(node_id, event_sender); Ok(()) } - None => Err(format!("no running dataflow with ID `{dataflow_id}`")), + None => Err(format!( + "subscribe failed: no running dataflow with ID `{dataflow_id}`" + )), }; let _ = reply_sender.send(ControlReply::Result(result)); } @@ -416,10 +424,9 @@ impl Daemon { } = message; let data = data.map(|(m, len)| (Rc::new(m), len)); - let dataflow = self - .running - .get_mut(&dataflow_id) - .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; + let dataflow = self.running.get_mut(&dataflow_id).wrap_err_with(|| { + format!("send out failed: no running dataflow with ID `{dataflow_id}`") + })?; // figure out receivers from dataflow graph let empty_set = BTreeSet::new(); @@ -482,7 +489,7 @@ impl Daemon { let dataflow = self .running .get_mut(&dataflow_id) - .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; + .wrap_err_with(|| format!("failed to get downstream nodes: no running dataflow with ID `{dataflow_id}`"))?; let downstream_nodes: BTreeSet<_> = dataflow .mappings .iter() @@ -499,12 +506,20 @@ impl Daemon { id: input_id.clone(), }) .await; + + if let Some(open_inputs) = dataflow.open_inputs.get_mut(receiver_id) { + open_inputs.remove(input_id); + if open_inputs.is_empty() { + // close the subscriber channel + dataflow.subscribe_channels.remove(receiver_id); + } + } } // TODO: notify remote nodes - dataflow.subscribe_channels.remove(&node_id); - if dataflow.subscribe_channels.is_empty() { + dataflow.running_nodes.remove(&node_id); + if dataflow.running_nodes.is_empty() { tracing::info!( "Dataflow `{dataflow_id}` finished on machine `{}`", self.machine_id @@ -629,6 +644,8 @@ pub struct RunningDataflow { subscribe_channels: HashMap>, mappings: HashMap>, timers: BTreeMap>, + open_inputs: BTreeMap>, + running_nodes: 
BTreeSet, /// Keep handles to all timer tasks of this dataflow to cancel them on drop. _timer_handles: Vec>, } diff --git a/examples/python-dataflow/no_webcam.py b/examples/python-dataflow/no_webcam.py index 68de6f9a..40acf8bb 100755 --- a/examples/python-dataflow/no_webcam.py +++ b/examples/python-dataflow/no_webcam.py @@ -24,9 +24,7 @@ while time.time() - start < 20: node.send_output("image", arr.tobytes()) case "stop": print("received stop") - break case other: print("received unexpected event:", other) - break time.sleep(1) diff --git a/examples/python-dataflow/object_detection.py b/examples/python-dataflow/object_detection.py index ef33e946..b788cf75 100755 --- a/examples/python-dataflow/object_detection.py +++ b/examples/python-dataflow/object_detection.py @@ -30,7 +30,5 @@ for event in node: print("ignoring unexpected input:", other) case "stop": print("received stop") - break case other: print("received unexpected event:", other) - break diff --git a/examples/python-dataflow/plot.py b/examples/python-dataflow/plot.py index e3107702..67a9b3db 100755 --- a/examples/python-dataflow/plot.py +++ b/examples/python-dataflow/plot.py @@ -102,7 +102,5 @@ for event in node: break case "stop": print("received stop") - break case other: print("received unexpected event:", other) - break From 7888b7d0e464d494b64095d7ba3c18408d70de43 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 2 Jan 2023 18:33:36 +0100 Subject: [PATCH 078/225] Remove uneeded breaks from other examples too --- examples/c++-dataflow/node-rust-api/main.cc | 1 - examples/c-dataflow/node.c | 2 -- examples/c-dataflow/sink.c | 4 ---- examples/rust-dataflow/node/src/main.rs | 2 +- examples/rust-dataflow/sink/src/main.rs | 7 ++++--- 5 files changed, 5 insertions(+), 11 deletions(-) diff --git a/examples/c++-dataflow/node-rust-api/main.cc b/examples/c++-dataflow/node-rust-api/main.cc index df8542ca..caf11373 100644 --- a/examples/c++-dataflow/node-rust-api/main.cc +++ 
b/examples/c++-dataflow/node-rust-api/main.cc @@ -41,7 +41,6 @@ int main() else { std::cerr << "Unknown event type " << static_cast(ty) << std::endl; - return -1; } } diff --git a/examples/c-dataflow/node.c b/examples/c-dataflow/node.c index ff8949ae..e858baf1 100644 --- a/examples/c-dataflow/node.c +++ b/examples/c-dataflow/node.c @@ -49,8 +49,6 @@ int main() else if (ty == DoraEventType_Stop) { printf("[c node] received stop event\n"); - free_dora_event(event); - break; } else { diff --git a/examples/c-dataflow/sink.c b/examples/c-dataflow/sink.c index 486bab01..3d40894d 100644 --- a/examples/c-dataflow/sink.c +++ b/examples/c-dataflow/sink.c @@ -45,14 +45,10 @@ int main() else if (ty == DoraEventType_InputClosed) { printf("[c sink] received InputClosed event\n"); - free_dora_event(event); - break; } else if (ty == DoraEventType_Stop) { printf("[c sink] received stop event\n"); - free_dora_event(event); - break; } else { diff --git a/examples/rust-dataflow/node/src/main.rs b/examples/rust-dataflow/node/src/main.rs index 17f635b0..0bdabcea 100644 --- a/examples/rust-dataflow/node/src/main.rs +++ b/examples/rust-dataflow/node/src/main.rs @@ -14,7 +14,6 @@ fn main() -> eyre::Result<()> { }; match event { - Event::Stop => break, Event::Input { id, metadata, @@ -30,6 +29,7 @@ fn main() -> eyre::Result<()> { } other => eprintln!("Ignoring unexpected input `{other}`"), }, + Event::Stop => println!("Received manual stop"), other => eprintln!("Received unexpected input: {other:?}"), } } diff --git a/examples/rust-dataflow/sink/src/main.rs b/examples/rust-dataflow/sink/src/main.rs index 20a0931f..ef6f8f14 100644 --- a/examples/rust-dataflow/sink/src/main.rs +++ b/examples/rust-dataflow/sink/src/main.rs @@ -6,7 +6,6 @@ fn main() -> eyre::Result<()> { while let Some(event) = events.recv() { match event { - Event::Stop => break, Event::Input { id, metadata: _, @@ -20,9 +19,11 @@ fn main() -> eyre::Result<()> { } other => eprintln!("Ignoring unexpected input `{other}`"), }, + 
Event::Stop => { + println!("Received manual stop"); + } Event::InputClosed { id } => { - println!("Input `{id}` was closed -> exiting"); - break; + println!("Input `{id}` was closed"); } other => eprintln!("Received unexpected input: {other:?}"), } From 352fb3d75770c3148842eaebab784fb0916cecc0 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 2 Jan 2023 18:36:33 +0100 Subject: [PATCH 079/225] Change Python event types to uppercase --- apis/python/node/src/lib.rs | 10 +++++----- examples/python-dataflow/no_webcam.py | 4 ++-- examples/python-dataflow/object_detection.py | 4 ++-- examples/python-dataflow/plot.py | 4 ++-- examples/python-dataflow/webcam.py | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index 6d45bbf5..8e2be28c 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -24,7 +24,7 @@ impl IntoPy for PyInput<'_> { let dict = PyDict::new(py); let ty = match self.0 { - Event::Stop => "stop", + Event::Stop => "STOP", Event::Input { id, metadata, data } => { dict.set_item("id", id.to_string()) .wrap_err("failed to add input ID") @@ -38,21 +38,21 @@ impl IntoPy for PyInput<'_> { dict.set_item("metadata", metadata_to_pydict(&metadata, py)) .wrap_err("failed to add input metadata") .unwrap(); - "input" + "INPUT" } Event::InputClosed { id } => { dict.set_item("id", id.to_string()) .wrap_err("failed to add closed-input ID") .unwrap(); - "input-closed" + "INPUT_CLOSED" } Event::Error(err) => { dict.set_item("error", err) .wrap_err("failed to add error") .unwrap(); - "error" + "ERROR" } - _other => "unknown", + _other => "UNKNOWN", }; dict.set_item("type", ty) diff --git a/examples/python-dataflow/no_webcam.py b/examples/python-dataflow/no_webcam.py index 40acf8bb..01a99cb2 100755 --- a/examples/python-dataflow/no_webcam.py +++ b/examples/python-dataflow/no_webcam.py @@ -19,10 +19,10 @@ while time.time() - start < 20: # Wait next dora_input event = 
node.next() match event["type"]: - case "input": + case "INPUT": print("received input", event["id"]) node.send_output("image", arr.tobytes()) - case "stop": + case "STOP": print("received stop") case other: print("received unexpected event:", other) diff --git a/examples/python-dataflow/object_detection.py b/examples/python-dataflow/object_detection.py index b788cf75..3f07a18c 100755 --- a/examples/python-dataflow/object_detection.py +++ b/examples/python-dataflow/object_detection.py @@ -15,7 +15,7 @@ node = Node() for event in node: match event["type"]: - case "input": + case "INPUT": match event["id"]: case "image": print("received image input") @@ -28,7 +28,7 @@ for event in node: node.send_output("bbox", arrays, event["metadata"]) case other: print("ignoring unexpected input:", other) - case "stop": + case "STOP": print("received stop") case other: print("received unexpected event:", other) diff --git a/examples/python-dataflow/plot.py b/examples/python-dataflow/plot.py index 67a9b3db..e3723d94 100755 --- a/examples/python-dataflow/plot.py +++ b/examples/python-dataflow/plot.py @@ -92,7 +92,7 @@ node = Node() for event in node: match event["type"]: - case "input": + case "INPUT": status = plotter.on_input(event) match status: case Status.CONTINUE: @@ -100,7 +100,7 @@ for event in node: case Status.STOP: print("plotter returned stop status") break - case "stop": + case "STOP": print("received stop") case other: print("received unexpected event:", other) diff --git a/examples/python-dataflow/webcam.py b/examples/python-dataflow/webcam.py index 435fb5ec..cbcaedfc 100755 --- a/examples/python-dataflow/webcam.py +++ b/examples/python-dataflow/webcam.py @@ -17,11 +17,11 @@ while time.time() - start < 10: # Wait next dora_input event = node.next() match event["type"]: - case "input": + case "INPUT": ret, frame = video_capture.read() if ret: node.send_output("image", cv2.imencode(".jpg", frame)[1].tobytes()) - case "stop": + case "STOP": print("received stop") break 
case other: From 97b525a2f37c105c6193110b1fd76813e173243f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 5 Jan 2023 14:57:19 +0100 Subject: [PATCH 080/225] Update benchmark example to be comparable with latest main changes --- examples/benchmark/dataflow.yml | 7 +-- examples/benchmark/node/src/main.rs | 28 ++++++++++- examples/benchmark/sink/src/main.rs | 76 ++++++++++++++++------------- 3 files changed, 72 insertions(+), 39 deletions(-) diff --git a/examples/benchmark/dataflow.yml b/examples/benchmark/dataflow.yml index 87452f6c..382a6909 100644 --- a/examples/benchmark/dataflow.yml +++ b/examples/benchmark/dataflow.yml @@ -8,11 +8,12 @@ nodes: build: cargo build -p benchmark-example-node --release source: ../../target/release/benchmark-example-node outputs: - - random + - latency + - throughput - id: rust-sink custom: build: cargo build -p benchmark-example-sink --release source: ../../target/release/benchmark-example-sink inputs: - # message: runtime-node/rust-operator/status - message: rust-node/random + latency: rust-node/latency + throughput: rust-node/throughput diff --git a/examples/benchmark/node/src/main.rs b/examples/benchmark/node/src/main.rs index 14b63c7d..59dab3a1 100644 --- a/examples/benchmark/node/src/main.rs +++ b/examples/benchmark/node/src/main.rs @@ -1,8 +1,11 @@ +use std::time::Duration; + use dora_node_api::{self, dora_core::config::DataId, DoraNode}; use rand::Rng; fn main() -> eyre::Result<()> { - let output = DataId::from("random".to_owned()); + let latency = DataId::from("latency".to_owned()); + let throughput = DataId::from("throughput".to_owned()); let (mut node, _events) = DoraNode::init_from_env()?; let sizes = [ @@ -18,13 +21,34 @@ fn main() -> eyre::Result<()> { 1000 * 4096, 10000 * 4096, ]; + + // test latency first + for size in sizes { + for _ in 0..100 { + let data: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(size) + .collect(); + node.send_output(latency.clone(), 
Default::default(), data.len(), |out| { + out.copy_from_slice(&data); + })?; + + // sleep a bit to avoid queue buildup + std::thread::sleep(Duration::from_millis(10)); + } + } + + // wait a bit to ensure that all throughput messages reached their target + std::thread::sleep(Duration::from_secs(2)); + + // then throughput with full speed for size in sizes { for _ in 0..100 { let data: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) .take(size) .collect(); - node.send_output(output.clone(), Default::default(), data.len(), |out| { + node.send_output(throughput.clone(), Default::default(), data.len(), |out| { out.copy_from_slice(&data); })?; } diff --git a/examples/benchmark/sink/src/main.rs b/examples/benchmark/sink/src/main.rs index 5ad29240..9fea5acb 100644 --- a/examples/benchmark/sink/src/main.rs +++ b/examples/benchmark/sink/src/main.rs @@ -1,52 +1,58 @@ use dora_node_api::{self, daemon::Event, DoraNode}; -use eyre::ContextCompat; -use std::{ - fmt::Write as _, - time::{Duration, Instant}, -}; +use std::time::{Duration, Instant}; fn main() -> eyre::Result<()> { let (_node, mut events) = DoraNode::init_from_env()?; + // latency is tested first + let mut latency = true; + let mut current_size = 0; let mut n = 0; let mut start = Instant::now(); let mut latencies = Vec::new(); - let mut summary = String::new(); + println!("Latency:"); while let Some(event) = events.recv() { match event { - Event::Stop => break, - Event::Input { id, metadata, data } => match id.as_str() { - "message" => { - let data = data.as_deref().unwrap_or_default(); + Event::Input { id, metadata, data } => { + // check if new size bracket + let data_len = data.map(|d| d.len()).unwrap_or_default(); + if data_len != current_size { + if n > 0 { + record_results(start, current_size, n, latencies, latency); + } + current_size = data_len; + n = 0; + start = Instant::now(); + latencies = Vec::new(); + } - if data.len() != current_size { - if n > 0 { - record_results(start, 
current_size, n, latencies, &mut summary); - } - current_size = data.len(); - n = 0; - start = Instant::now(); - latencies = Vec::new(); + match id.as_str() { + "latency" if latency => {} + "throughput" if latency => { + latency = false; + println!("Throughput:"); + } + "throughput" => {} + other => { + eprintln!("Ignoring unexpected input `{other}`"); + continue; } - n += 1; - latencies.push(metadata.timestamp().get_time().to_system_time().elapsed()?); } - other => eprintln!("Ignoring unexpected input `{other}`"), - }, + + n += 1; + latencies.push(metadata.timestamp().get_time().to_system_time().elapsed()?); + } Event::InputClosed { id } => { - println!("Input `{id}` was closed -> exiting"); - break; + println!("Input `{id}` was closed"); } other => eprintln!("Received unexpected input: {other:?}"), } } - record_results(start, current_size, n, latencies, &mut summary); - - println!("\nSummary:\n{summary}"); + record_results(start, current_size, n, latencies, latency); Ok(()) } @@ -56,13 +62,15 @@ fn record_results( current_size: usize, n: u32, latencies: Vec, - summary: &mut String, + latency: bool, ) { - let duration = start.elapsed(); - let per_message = duration / n; - let avg_latency = latencies.iter().sum::() / n; - let msg = - format!("size {current_size:<#8x}: {per_message:?} per message (latency: {avg_latency:?})"); + let msg = if latency { + let avg_latency = latencies.iter().sum::() / n; + format!("size {current_size:<#8x}: {avg_latency:?}") + } else { + let duration = start.elapsed(); + let msg_per_sec = n as f64 / duration.as_secs_f64(); + format!("size {current_size:<#8x}: {msg_per_sec:.0} messages per second") + }; println!("{msg}"); - writeln!(summary, "{msg}").unwrap(); } From 70298d836fe58f9fa3c7a2bcc611346d9c0b7135 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 3 Jan 2023 17:52:43 +0100 Subject: [PATCH 081/225] Start migrating the daemon control channel to shared memory The goal is to avoid the latency of the TCP socket. 
--- Cargo.lock | 79 +++++++++------ apis/rust/node/Cargo.toml | 1 + apis/rust/node/src/daemon.rs | 91 ++++++++++------- apis/rust/node/src/lib.rs | 5 +- binaries/daemon/src/lib.rs | 18 ++-- binaries/daemon/src/listener.rs | 104 ++++++++++++++++++- binaries/daemon/src/spawn.rs | 30 ++++-- libraries/core/Cargo.toml | 3 + libraries/core/src/daemon_messages.rs | 22 ++++ libraries/core/src/lib.rs | 1 + libraries/core/src/shm_channel.rs | 138 ++++++++++++++++++++++++++ 11 files changed, 409 insertions(+), 83 deletions(-) create mode 100644 libraries/core/src/shm_channel.rs diff --git a/Cargo.lock b/Cargo.lock index dc121df2..acc4e7b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14,7 +14,7 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "cipher", "cpufeatures", "opaque-debug 0.3.0", @@ -141,7 +141,7 @@ checksum = "83137067e3a2a6a06d67168e49e68a0957d215410473a740cea95a2425c0b7c6" dependencies = [ "async-io", "blocking", - "cfg-if", + "cfg-if 1.0.0", "event-listener", "futures-lite", "libc", @@ -426,6 +426,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + [[package]] name = "cfg-if" version = "1.0.0" @@ -672,7 +678,7 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -681,7 +687,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "crossbeam-utils", ] @@ -691,7 +697,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "crossbeam-epoch", "crossbeam-utils", ] @@ -703,7 +709,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" dependencies = [ "autocfg 1.1.0", - "cfg-if", + "cfg-if 1.0.0", "crossbeam-utils", "lazy_static", "memoffset", @@ -716,7 +722,7 @@ version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "lazy_static", ] @@ -867,7 +873,7 @@ version = "4.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e77a43b28d0668df09411cb0bc9a8c2adc40f9a048afe863e05fd43251e8e39c" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "num_cpus", ] @@ -905,7 +911,7 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "dirs-sys-next", ] @@ -973,12 +979,15 @@ dependencies = [ name = "dora-core" version = "0.1.2" dependencies = [ + "bincode", "dora-message", "eyre", "once_cell", + "raw_sync", "serde", "serde_yaml 0.9.11", "shared_memory", + "tracing", "uuid 1.2.1", "which", "zenoh-config", @@ -1064,6 +1073,7 @@ dependencies = [ "eyre", "flume", "once_cell", + "raw_sync", "serde", "serde_json", "serde_yaml 0.8.23", @@ -1197,7 +1207,7 @@ version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" dependencies = [ - 
"cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -1469,7 +1479,7 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "js-sys", "libc", "wasi 0.10.2+wasi-snapshot-preview1", @@ -1773,7 +1783,7 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -1910,7 +1920,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "winapi", ] @@ -1950,7 +1960,7 @@ version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "value-bag", ] @@ -2080,7 +2090,7 @@ name = "napi-build" version = "1.0.1" source = "git+https://github.com/getditto/napi-rs?branch=ditto/closure-into-jsfunction#da095cc3f1af133344083b525d7e9763b347e249" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "ureq", ] @@ -2244,7 +2254,7 @@ checksum = "e4916f159ed8e5de0082076562152a76b7a1f64a01fd9d1e0fea002c37624faf" dependencies = [ "bitflags", "cc", - "cfg-if", + "cfg-if 1.0.0", "libc", "memoffset", ] @@ -2257,7 +2267,7 @@ checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" dependencies = [ "bitflags", "cc", - "cfg-if", + "cfg-if 1.0.0", "libc", "memoffset", ] @@ -2394,7 +2404,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12fc0523e3bd51a692c8850d075d74dc062ccf251c0110668cbd921917118a13" dependencies = [ "bitflags", - "cfg-if", + "cfg-if 1.0.0", "foreign-types", "libc", "once_cell", @@ -2554,7 +2564,7 @@ version = "0.9.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "28141e0cc4143da2443301914478dc976a61ffdb3f043058310c70df2fed8954" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "redox_syscall", "smallvec", @@ -2791,7 +2801,7 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "685404d509889fade3e86fe3a5803bca2ec09b0c0778d5ada6ec8bf7a8de5259" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "log", "wepoll-ffi", @@ -2923,7 +2933,7 @@ version = "0.16.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e6302e85060011447471887705bb7838f14aba43fcb06957d823739a496b3dc" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "eyre", "indoc", "libc", @@ -3084,6 +3094,19 @@ dependencies = [ "cty", ] +[[package]] +name = "raw_sync" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a34bde3561f980a51c70495164200569a11662644fe5af017f0b5d7015688cc" +dependencies = [ + "cfg-if 0.1.10", + "libc", + "nix 0.23.1", + "rand", + "winapi", +] + [[package]] name = "rayon" version = "1.5.1" @@ -3560,7 +3583,7 @@ version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "681a9e90340f748af3a1cc52eb2c040eee29f976b763e99ad90fc0c5df6f9791" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "nix 0.22.3", "rand", @@ -3659,7 +3682,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af91f480ee899ab2d9f8435bfdfc14d08a5754bd9d3fef1f1a1c23336aad6c8b" dependencies = [ "async-channel", - "cfg-if", + "cfg-if 1.0.0", "futures-core", "pin-project-lite", ] @@ -3711,7 +3734,7 @@ version = "0.24.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d80929a3b477bce3a64360ca82bfb361eacce1dcb7b1fb31e8e5e181e37c212" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "core-foundation-sys", "libc", "ntapi", @@ -3738,7 +3761,7 @@ version = "3.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "fastrand", "libc", "redox_syscall", @@ -4026,7 +4049,7 @@ version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "log", "pin-project-lite", "tracing-attributes", @@ -4360,7 +4383,7 @@ version = "0.2.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "serde", "serde_json", "wasm-bindgen-macro", @@ -4387,7 +4410,7 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eb6ec270a31b1d3c7e266b999739109abce8b6c87e4b31fcfcd788b65267395" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "js-sys", "wasm-bindgen", "web-sys", diff --git a/apis/rust/node/Cargo.toml b/apis/rust/node/Cargo.toml index 33680427..d685f34d 100644 --- a/apis/rust/node/Cargo.toml +++ b/apis/rust/node/Cargo.toml @@ -23,6 +23,7 @@ capnp = "0.14.11" dora-message = { path = "../../../libraries/message" } dora-core = { path = "../../../libraries/core" } shared_memory = "0.12.0" +raw_sync = "0.1.5" [dev-dependencies] tokio = { version = "1.17.0", features = ["rt"] } diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 03a272f3..3a11b142 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -8,6 +8,7 @@ use std::{ use dora_core::{ config::{DataId, NodeId}, daemon_messages::{ControlRequest, DataflowId, DropEvent, NodeEvent}, + shm_channel::ShmemChannel, }; use dora_message::Metadata; use eyre::{bail, eyre, Context}; @@ -19,11 +20,16 @@ pub struct DaemonConnection { } impl DaemonConnection { - pub fn init(dataflow_id: DataflowId, node_id: &NodeId, 
daemon_port: u16) -> eyre::Result { - let daemon_addr = (Ipv4Addr::new(127, 0, 0, 1), daemon_port).into(); - let control_channel = ControlChannel::init(daemon_addr, dataflow_id, node_id) + pub fn init( + dataflow_id: DataflowId, + node_id: &NodeId, + daemon_port: u16, + daemon_events_region_id: &str, + ) -> eyre::Result { + let control_channel = ControlChannel::init(dataflow_id, node_id, daemon_events_region_id) .wrap_err("failed to init control stream")?; + let daemon_addr = (Ipv4Addr::new(127, 0, 0, 1), daemon_port).into(); let event_stream = EventStream::init(daemon_addr, dataflow_id, node_id) .wrap_err("failed to init event stream")?; @@ -34,42 +40,53 @@ impl DaemonConnection { } } -pub struct ControlChannel(TcpStream); +pub struct ControlChannel { + channel: ShmemChannel, +} impl ControlChannel { + #[tracing::instrument] fn init( - daemon_addr: SocketAddr, dataflow_id: DataflowId, node_id: &NodeId, + daemon_events_region_id: &str, ) -> eyre::Result { - let mut control_stream = - TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; - control_stream - .set_nodelay(true) - .wrap_err("failed to set TCP_NODELAY")?; - tcp_send( - &mut control_stream, - &ControlRequest::Register { - dataflow_id, - node_id: node_id.clone(), - }, - ) - .wrap_err("failed to send register request to dora-daemon")?; - match tcp_receive(&mut control_stream) - .wrap_err("failed to receive register reply from dora-daemon")? 
- { + let daemon_events_region = ShmemConf::new() + .os_id(daemon_events_region_id) + .open() + .wrap_err("failed to connect to dora-daemon")?; + let mut channel = unsafe { ShmemChannel::new_client(daemon_events_region) } + .wrap_err("failed to create ShmemChannel")?; + + let msg = ControlRequest::Register { + dataflow_id, + node_id: node_id.clone(), + }; + channel + .send(&msg) + .wrap_err("failed to send register request to dora-daemon")?; + + // wait for reply + let reply = channel + .receive() + .wrap_err("failed to wait for receive register reply from dora-daemon")?; + match reply { dora_core::daemon_messages::ControlReply::Result(result) => result .map_err(|e| eyre!(e)) .wrap_err("failed to register node with dora-daemon")?, other => bail!("unexpected register reply: {other:?}"), } - Ok(Self(control_stream)) + + Ok(Self { channel }) } pub fn report_stop(&mut self) -> eyre::Result<()> { - tcp_send(&mut self.0, &ControlRequest::Stopped) + self.channel + .send(&ControlRequest::Stopped) .wrap_err("failed to report stopped to dora-daemon")?; - match tcp_receive(&mut self.0) + match self + .channel + .receive() .wrap_err("failed to receive stopped reply from dora-daemon")? { dora_core::daemon_messages::ControlReply::Result(result) => result @@ -86,16 +103,16 @@ impl ControlChannel { metadata: dora_message::Metadata<'static>, data_len: usize, ) -> eyre::Result { - tcp_send( - &mut self.0, - &ControlRequest::PrepareOutputMessage { + self.channel + .send(&ControlRequest::PrepareOutputMessage { output_id, metadata, data_len, - }, - ) - .wrap_err("failed to send PrepareOutputMessage request to dora-daemon")?; - match tcp_receive(&mut self.0) + }) + .wrap_err("failed to send PrepareOutputMessage request to dora-daemon")?; + match self + .channel + .receive() .wrap_err("failed to receive PrepareOutputMessage reply from dora-daemon")? 
{ dora_core::daemon_messages::ControlReply::PreparedMessage { @@ -109,12 +126,12 @@ impl ControlChannel { } pub fn send_message(&mut self, sample: MessageSample) -> eyre::Result<()> { - tcp_send( - &mut self.0, - &ControlRequest::SendOutMessage { id: sample.id }, - ) - .wrap_err("failed to send SendOutMessage request to dora-daemon")?; - match tcp_receive(&mut self.0) + self.channel + .send(&ControlRequest::SendOutMessage { id: sample.id }) + .wrap_err("failed to send SendOutMessage request to dora-daemon")?; + match self + .channel + .receive() .wrap_err("failed to receive SendOutMessage reply from dora-daemon")? { dora_core::daemon_messages::ControlReply::Result(result) => { diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 51b97aa4..a6b26b2f 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -5,7 +5,7 @@ use dora_core::{ daemon_messages::NodeConfig, }; pub use dora_message::{uhlc, Metadata, MetadataParameters}; -use eyre::WrapErr; +use eyre::{eyre, WrapErr}; pub use flume::Receiver; use shared_memory::ShmemConf; @@ -37,12 +37,13 @@ impl DoraNode { node_id, run_config, daemon_port, + daemon_events_region_id, } = node_config; let DaemonConnection { control_channel, event_stream, - } = DaemonConnection::init(dataflow_id, &node_id, daemon_port) + } = DaemonConnection::init(dataflow_id, &node_id, daemon_port, &daemon_events_region_id) .wrap_err("failed to connect to dora-daemon")?; let node = Self { diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 7ac18f08..033effaa 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -45,7 +45,7 @@ pub struct Daemon { running: HashMap, - dora_events_tx: mpsc::Sender, + events_tx: mpsc::Sender, coordinator_addr: Option, machine_id: String, @@ -152,12 +152,12 @@ impl Daemon { prepared_messages: Default::default(), sent_out_shared_memory: Default::default(), running: HashMap::new(), - dora_events_tx, + events_tx: dora_events_tx, 
coordinator_addr, machine_id, exit_when_done, }; - let dora_events = ReceiverStream::new(dora_events_rx).map(Event::Dora); + let dora_events = ReceiverStream::new(dora_events_rx); let watchdog_interval = tokio_stream::wrappers::IntervalStream::new(tokio::time::interval( Duration::from_secs(5), )) @@ -326,12 +326,12 @@ impl Daemon { } } - spawn::spawn_node(dataflow_id, params, self.port, self.dora_events_tx.clone()) + spawn::spawn_node(dataflow_id, params, self.port, self.events_tx.clone()) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; } for interval in dataflow.timers.keys().copied() { - let events_tx = self.dora_events_tx.clone(); + let events_tx = self.events_tx.clone(); let task = async move { let mut interval_stream = tokio::time::interval(interval); let hlc = HLC::default(); @@ -346,7 +346,7 @@ impl Daemon { Default::default(), ), }; - if events_tx.send(event).await.is_err() { + if events_tx.send(event.into()).await.is_err() { break; } } @@ -669,6 +669,12 @@ pub enum Event { WatchdogInterval, } +impl From for Event { + fn from(event: DoraEvent) -> Self { + Event::Dora(event) + } +} + #[derive(Debug)] pub enum DaemonNodeEvent { PrepareOutputMessage { diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index ee75a4c5..cb633914 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -2,7 +2,10 @@ use crate::{ tcp_utils::{tcp_receive, tcp_send}, DaemonNodeEvent, Event, }; -use dora_core::daemon_messages::{self, DropEvent}; +use dora_core::{ + daemon_messages::{self, DropEvent}, + shm_channel::ShmemChannel, +}; use eyre::{eyre, Context}; use std::{io::ErrorKind, net::Ipv4Addr}; use tokio::{ @@ -142,6 +145,105 @@ pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sende } } +#[tracing::instrument(skip(channel, events_tx))] +pub fn listener_loop(mut channel: ShmemChannel, events_tx: mpsc::Sender) { + let mut id = None; + let mut enter_subscribe_loop = None; + 
loop { + // receive the next message + let message = match channel.receive().wrap_err("failed to receive node message") { + Ok(m) => m, + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + }; + + // handle the message and translate it to a NodeEvent + let node_event = match message { + daemon_messages::ControlRequest::Register { + dataflow_id, + node_id, + } => { + id = Some((dataflow_id, node_id)); + + let reply = daemon_messages::ControlReply::Result(Ok(())); + + match channel.send(&reply) { + Ok(()) => continue, // don't trigger an event for register calls + Err(err) => { + tracing::warn!("{err:?}"); + break; // close connection + } + } + } + daemon_messages::ControlRequest::Stopped => DaemonNodeEvent::Stopped, + daemon_messages::ControlRequest::PrepareOutputMessage { + output_id, + metadata, + data_len, + } => DaemonNodeEvent::PrepareOutputMessage { + output_id, + metadata, + data_len, + }, + daemon_messages::ControlRequest::SendOutMessage { id } => { + DaemonNodeEvent::SendOutMessage { id } + } + daemon_messages::ControlRequest::Subscribe { + dataflow_id, + node_id, + } => { + let (tx, rx) = flume::bounded(10); + + id = Some((dataflow_id, node_id)); + enter_subscribe_loop = Some(rx); + + DaemonNodeEvent::Subscribe { event_sender: tx } + } + }; + + let (dataflow_id, node_id) = match &id { + Some(id) => id.clone(), + None => { + tracing::warn!( + "Ignoring node event because no register \ + message was sent yet: {node_event:?}" + ); + continue; + } + }; + + // send NodeEvent to daemon main loop + let (reply_tx, reply) = oneshot::channel(); + let event = Event::Node { + dataflow_id, + node_id, + event: node_event, + reply_sender: reply_tx, + }; + let Ok(()) = events_tx.blocking_send(event) else { + break; + }; + + // wait for reply and send it out + let Ok(reply) = reply.blocking_recv() else { + break; // main loop exited + }; + if let Err(err) = channel.send(&reply).wrap_err("failed to send reply") { + tracing::error!("{err:?}"); + break; + } + + // 
enter subscribe loop after receiving a subscribe message + if let Some(events) = enter_subscribe_loop { + todo!() + // subscribe_loop(connection, events, events_tx).await; + // break; // the subscribe loop only exits when the connection was closed + } + } +} + async fn subscribe_loop( connection: TcpStream, events: flume::Receiver, diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index bc94c466..e900ccf6 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,10 +1,12 @@ -use crate::DoraEvent; +use crate::{listener::listener_loop, DoraEvent, Event}; use dora_core::{ daemon_messages::{DataflowId, NodeConfig, SpawnNodeParams}, descriptor::{resolve_path, source_is_url}, + shm_channel::ShmemChannel, }; use dora_download::download_file; use eyre::{eyre, WrapErr}; +use shared_memory::ShmemConf; use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; use tokio::sync::mpsc; @@ -13,7 +15,7 @@ pub async fn spawn_node( dataflow_id: DataflowId, params: SpawnNodeParams, daemon_port: u16, - result_tx: mpsc::Sender, + events_tx: mpsc::Sender, ) -> eyre::Result<()> { let SpawnNodeParams { node_id, @@ -36,12 +38,23 @@ pub async fn spawn_node( resolve_path(&node.source, &working_dir) .wrap_err_with(|| format!("failed to resolve node source `{}`", node.source))? 
}; + + let daemon_events_region = ShmemConf::new() + .size(4096) + .create() + .wrap_err("failed to allocate daemon_events_region")?; let node_config = NodeConfig { dataflow_id, node_id: node_id.clone(), run_config: node.run_config.clone(), daemon_port, + daemon_events_region_id: daemon_events_region.get_os_id().to_owned(), }; + let channel = unsafe { ShmemChannel::new_server(daemon_events_region) } + .wrap_err("failed to create ShmemChannel")?; + + let result_tx = events_tx.clone(); + tokio::task::spawn_blocking(move || listener_loop(channel, events_tx)); let mut command = tokio::process::Command::new(&resolved_path); if let Some(args) = &node.args { @@ -82,13 +95,12 @@ pub async fn spawn_node( }; tokio::spawn(async move { let result = wait_task.await; - let _ = result_tx - .send(DoraEvent::SpawnedNodeResult { - dataflow_id, - node_id: node_id_cloned, - result, - }) - .await; + let event = DoraEvent::SpawnedNodeResult { + dataflow_id, + node_id: node_id_cloned, + result, + }; + let _ = result_tx.send(event.into()).await; }); Ok(()) } diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml index ddf03223..6c7b347e 100644 --- a/libraries/core/Cargo.toml +++ b/libraries/core/Cargo.toml @@ -16,3 +16,6 @@ which = "4.3.0" uuid = { version = "1.2.1", features = ["serde"] } dora-message = { path = "../message" } shared_memory = "0.12.0" +bincode = "1.3.3" +raw_sync = "0.1.5" +tracing = "0.1" diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 6842d71a..1bf70f7d 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -5,6 +5,7 @@ use crate::{ descriptor, }; use dora_message::Metadata; +use eyre::Context; use uuid::Uuid; #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -13,6 +14,7 @@ pub struct NodeConfig { pub node_id: NodeId, pub run_config: NodeRunConfig, pub daemon_port: u16, + pub daemon_events_region_id: SharedMemoryId, } #[derive(Debug, serde::Serialize, 
serde::Deserialize)] @@ -36,6 +38,16 @@ pub enum ControlRequest { Stopped, } +impl ControlRequest { + pub fn serialize(&self) -> Vec { + bincode::serialize(self).unwrap() + } + + pub fn deserialize(data: &[u8]) -> eyre::Result { + bincode::deserialize(data).wrap_err("failed to deserialize ControlRequest") + } +} + type SharedMemoryId = String; #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -44,6 +56,16 @@ pub enum ControlReply { PreparedMessage { shared_memory_id: SharedMemoryId }, } +impl ControlReply { + pub fn serialize(&self) -> Vec { + bincode::serialize(self).unwrap() + } + + pub fn deserialize(data: &[u8]) -> eyre::Result { + bincode::deserialize(data).wrap_err("failed to deserialize ControlReply") + } +} + #[derive(Debug, serde::Serialize, serde::Deserialize)] pub enum NodeEvent { Stop, diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index a96517dc..61a1433b 100644 --- a/libraries/core/src/lib.rs +++ b/libraries/core/src/lib.rs @@ -8,6 +8,7 @@ pub mod config; pub mod coordinator_messages; pub mod daemon_messages; pub mod descriptor; +pub mod shm_channel; pub mod topics; pub fn adjust_shared_library_path(path: &Path) -> Result { diff --git a/libraries/core/src/shm_channel.rs b/libraries/core/src/shm_channel.rs new file mode 100644 index 00000000..549687b6 --- /dev/null +++ b/libraries/core/src/shm_channel.rs @@ -0,0 +1,138 @@ +use std::{mem, slice, sync::atomic::AtomicU64, time::Duration}; + +use eyre::{eyre, Context}; +use raw_sync::events::{Event, EventImpl, EventInit, EventState}; +use serde::{Deserialize, Serialize}; +use shared_memory::Shmem; + +/// Size of the encoded length in bytes. 
+const LEN_LEN: usize = mem::size_of::(); + +pub struct ShmemChannel { + memory: Shmem, + server_event: Box, + client_event: Box, + buffer_start_offset: usize, + server: bool, +} + +#[allow(clippy::missing_safety_doc)] +impl ShmemChannel { + pub unsafe fn new_server(memory: Shmem) -> eyre::Result { + let (server_event, server_event_len) = unsafe { Event::new(memory.as_ptr(), true) } + .map_err(|err| eyre!("failed to open raw server event: {err}"))?; + let (client_event, client_event_len) = + unsafe { Event::new(memory.as_ptr().wrapping_add(server_event_len), true) } + .map_err(|err| eyre!("failed to open raw client event: {err}"))?; + let buffer_start_offset = server_event_len + client_event_len; + + tracing::trace!( + "Initializing new ShmemChannel: buffer_start_offset: {buffer_start_offset}" + ); + + Ok(Self { + memory, + server_event, + client_event, + buffer_start_offset, + server: true, + }) + } + + pub unsafe fn new_client(memory: Shmem) -> eyre::Result { + let (server_event, offset) = unsafe { Event::from_existing(memory.as_ptr()) } + .map_err(|err| eyre!("failed to open raw server event: {err}"))?; + let (client_event, buffer_start_offset) = + unsafe { Event::from_existing(memory.as_ptr().wrapping_add(offset)) } + .map_err(|err| eyre!("failed to open raw client event: {err}"))?; + + Ok(Self { + memory, + server_event, + client_event, + buffer_start_offset, + server: false, + }) + } + + pub fn send(&mut self, value: &T) -> eyre::Result<()> + where + T: Serialize + std::fmt::Debug, + { + let msg = bincode::serialize(value).wrap_err("failed to serialize value")?; + tracing::debug!("sending message with length {}: {value:?}", msg.len()); + + let total_len = LEN_LEN + msg.len(); + assert!(total_len <= self.memory.len() - self.buffer_start_offset); + + // write data first + unsafe { + self.data_mut() + .copy_from_nonoverlapping(msg.as_ptr(), msg.len()); + } + + // write len second for synchronization + unsafe { + (*self.data_len()).store(msg.len() as u64, 
std::sync::atomic::Ordering::Release); + } + + // signal event + let event = if self.server { + &self.client_event + } else { + &self.server_event + }; + event + .set(EventState::Signaled) + .map_err(|err| eyre!("failed to send message over ShmemChannel: {err}"))?; + + Ok(()) + } + + pub fn receive(&mut self) -> eyre::Result + where + T: for<'a> Deserialize<'a> + std::fmt::Debug, + { + // wait for event + let (event, timeout) = if self.server { + (&self.server_event, raw_sync::Timeout::Infinite) + } else { + ( + &self.client_event, + raw_sync::Timeout::Val(Duration::from_secs(5)), + ) + }; + + event + .wait(timeout) + .map_err(|err| eyre!("failed to wait for reply from ShmemChannel: {err}"))?; + + // read len first for synchronization + let msg_len = + unsafe { &*self.data_len() }.load(std::sync::atomic::Ordering::Acquire) as usize; + assert!(msg_len < self.memory.len() - self.buffer_start_offset - LEN_LEN); + + let value_raw = unsafe { slice::from_raw_parts(self.data(), msg_len) }; + let msg = bincode::deserialize(value_raw).wrap_err("failed to deserialize value"); + tracing::debug!("received message with length {msg_len}: {msg:?}"); + msg + } + + fn data_len(&self) -> *const AtomicU64 { + self.data_len_ptr().cast() + } + + fn data_len_ptr(&self) -> *mut u8 { + self.memory.as_ptr().wrapping_add(self.buffer_start_offset) + } + + fn data(&self) -> *const u8 { + self.data_len_ptr().wrapping_add(LEN_LEN) + } + + fn data_mut(&mut self) -> *mut u8 { + self.data_len_ptr().wrapping_add(LEN_LEN) + } +} + +unsafe impl Send for ShmemChannel {} From be6c459c6ce5660a49962b4fae635cd4444f66da Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 4 Jan 2023 19:52:51 +0100 Subject: [PATCH 082/225] Set log level to DEBUG for daemon --- binaries/daemon/src/main.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index 8befc2f8..dbb8f24d 100644 --- a/binaries/daemon/src/main.rs +++ 
b/binaries/daemon/src/main.rs @@ -2,6 +2,8 @@ use dora_core::topics::DORA_COORDINATOR_PORT_DEFAULT; use dora_daemon::Daemon; use eyre::Context; use std::{net::Ipv4Addr, path::PathBuf}; +use tracing::metadata::LevelFilter; +use tracing_subscriber::Layer; #[derive(Debug, Clone, clap::Parser)] #[clap(about = "Dora daemon")] @@ -43,7 +45,9 @@ async fn run() -> eyre::Result<()> { fn set_up_tracing() -> eyre::Result<()> { use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; - let stdout_log = tracing_subscriber::fmt::layer().pretty(); + let stdout_log = tracing_subscriber::fmt::layer() + .pretty() + .with_filter(LevelFilter::DEBUG); let subscriber = tracing_subscriber::Registry::default().with(stdout_log); tracing::subscriber::set_global_default(subscriber) .context("failed to set tracing global subscriber") From 6253d6e5ad83765505c35628413c489d1bf2e37a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 4 Jan 2023 19:53:48 +0100 Subject: [PATCH 083/225] Fix: Initialize events to 'clear' state and length to 0 --- libraries/core/src/shm_channel.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/libraries/core/src/shm_channel.rs b/libraries/core/src/shm_channel.rs index 549687b6..f31b039f 100644 --- a/libraries/core/src/shm_channel.rs +++ b/libraries/core/src/shm_channel.rs @@ -26,9 +26,19 @@ impl ShmemChannel { .map_err(|err| eyre!("failed to open raw client event: {err}"))?; let buffer_start_offset = server_event_len + client_event_len; - tracing::trace!( - "Initializing new ShmemChannel: buffer_start_offset: {buffer_start_offset}" - ); + server_event + .set(EventState::Clear) + .map_err(|err| eyre!("failed to init server_event: {err}"))?; + client_event + .set(EventState::Clear) + .map_err(|err| eyre!("failed to init client_event: {err}"))?; + unsafe { + memory + .as_ptr() + .wrapping_add(buffer_start_offset) + .cast::() + .write(AtomicU64::new(0)); + } Ok(Self { memory, From 
f04960d6809b77c58458afb2075ead3450a4c148 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 4 Jan 2023 19:54:24 +0100 Subject: [PATCH 084/225] Fix event + data offsets for Shmem channel client --- libraries/core/src/shm_channel.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libraries/core/src/shm_channel.rs b/libraries/core/src/shm_channel.rs index f31b039f..56e70fb2 100644 --- a/libraries/core/src/shm_channel.rs +++ b/libraries/core/src/shm_channel.rs @@ -50,11 +50,12 @@ impl ShmemChannel { } pub unsafe fn new_client(memory: Shmem) -> eyre::Result { - let (server_event, offset) = unsafe { Event::from_existing(memory.as_ptr()) } + let (server_event, server_event_len) = unsafe { Event::from_existing(memory.as_ptr()) } .map_err(|err| eyre!("failed to open raw server event: {err}"))?; - let (client_event, buffer_start_offset) = - unsafe { Event::from_existing(memory.as_ptr().wrapping_add(offset)) } + let (client_event, client_event_len) = + unsafe { Event::from_existing(memory.as_ptr().wrapping_add(server_event_len)) } .map_err(|err| eyre!("failed to open raw client event: {err}"))?; + let buffer_start_offset = server_event_len + client_event_len; Ok(Self { memory, From 6ae75c04bd2f0795b062b48442296881290e2929 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 4 Jan 2023 19:54:48 +0100 Subject: [PATCH 085/225] Remove some log messages --- libraries/core/src/shm_channel.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libraries/core/src/shm_channel.rs b/libraries/core/src/shm_channel.rs index 56e70fb2..bc3b6d8b 100644 --- a/libraries/core/src/shm_channel.rs +++ b/libraries/core/src/shm_channel.rs @@ -71,7 +71,6 @@ impl ShmemChannel { T: Serialize + std::fmt::Debug, { let msg = bincode::serialize(value).wrap_err("failed to serialize value")?; - tracing::debug!("sending message with length {}: {value:?}", msg.len()); let total_len = LEN_LEN + msg.len(); assert!(total_len <= self.memory.len() - 
self.buffer_start_offset); @@ -124,9 +123,8 @@ impl ShmemChannel { assert!(msg_len < self.memory.len() - self.buffer_start_offset - LEN_LEN); let value_raw = unsafe { slice::from_raw_parts(self.data(), msg_len) }; - let msg = bincode::deserialize(value_raw).wrap_err("failed to deserialize value"); - tracing::debug!("received message with length {msg_len}: {msg:?}"); - msg + + bincode::deserialize(value_raw).wrap_err("failed to deserialize value") } fn data_len(&self) -> *const AtomicU64 { From 8d1c3742a1b67a52189e5b230b4accabdd500586 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 5 Jan 2023 15:49:13 +0100 Subject: [PATCH 086/225] Implement disconnect signal for `ShmemChannel` and set it on drop --- apis/rust/node/src/daemon.rs | 6 +- binaries/daemon/src/listener.rs | 6 +- libraries/core/src/shm_channel.rs | 115 ++++++++++++++++++++++-------- 3 files changed, 96 insertions(+), 31 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 3a11b142..aefe82d1 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -69,7 +69,8 @@ impl ControlChannel { // wait for reply let reply = channel .receive() - .wrap_err("failed to wait for receive register reply from dora-daemon")?; + .wrap_err("failed to wait for receive register reply from dora-daemon")? + .ok_or_else(|| eyre!("daemon disconnected unexpectedly"))?; match reply { dora_core::daemon_messages::ControlReply::Result(result) => result .map_err(|e| eyre!(e)) @@ -88,6 +89,7 @@ impl ControlChannel { .channel .receive() .wrap_err("failed to receive stopped reply from dora-daemon")? + .ok_or_else(|| eyre!("daemon disconnected unexpectedly"))? { dora_core::daemon_messages::ControlReply::Result(result) => result .map_err(|e| eyre!(e)) @@ -114,6 +116,7 @@ impl ControlChannel { .channel .receive() .wrap_err("failed to receive PrepareOutputMessage reply from dora-daemon")? + .ok_or_else(|| eyre!("daemon disconnected unexpectedly"))? 
{ dora_core::daemon_messages::ControlReply::PreparedMessage { shared_memory_id: id, @@ -133,6 +136,7 @@ impl ControlChannel { .channel .receive() .wrap_err("failed to receive SendOutMessage reply from dora-daemon")? + .ok_or_else(|| eyre!("daemon disconnected unexpectedly"))? { dora_core::daemon_messages::ControlReply::Result(result) => { result.map_err(|err| eyre!(err)) diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index cb633914..a0f26bee 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -152,7 +152,11 @@ pub fn listener_loop(mut channel: ShmemChannel, events_tx: mpsc::Sender) loop { // receive the next message let message = match channel.receive().wrap_err("failed to receive node message") { - Ok(m) => m, + Ok(Some(m)) => m, + Ok(None) => { + tracing::info!("control channel disconnected: {id:?}"); + break; + } // disconnected Err(err) => { tracing::warn!("{err:?}"); continue; diff --git a/libraries/core/src/shm_channel.rs b/libraries/core/src/shm_channel.rs index bc3b6d8b..8a995e26 100644 --- a/libraries/core/src/shm_channel.rs +++ b/libraries/core/src/shm_channel.rs @@ -1,18 +1,21 @@ -use std::{mem, slice, sync::atomic::AtomicU64, time::Duration}; +use std::{ + mem, slice, + sync::atomic::{AtomicBool, AtomicU64}, + time::Duration, +}; use eyre::{eyre, Context}; use raw_sync::events::{Event, EventImpl, EventInit, EventState}; use serde::{Deserialize, Serialize}; use shared_memory::Shmem; -/// Size of the encoded length in bytes. 
-const LEN_LEN: usize = mem::size_of::(); - pub struct ShmemChannel { memory: Shmem, server_event: Box, client_event: Box, - buffer_start_offset: usize, + disconnect_offset: usize, + len_offset: usize, + data_offset: usize, server: bool, } @@ -24,7 +27,8 @@ impl ShmemChannel { let (client_event, client_event_len) = unsafe { Event::new(memory.as_ptr().wrapping_add(server_event_len), true) } .map_err(|err| eyre!("failed to open raw client event: {err}"))?; - let buffer_start_offset = server_event_len + client_event_len; + let (disconnect_offset, len_offset, data_offset) = + offsets(server_event_len, client_event_len); server_event .set(EventState::Clear) @@ -35,7 +39,14 @@ impl ShmemChannel { unsafe { memory .as_ptr() - .wrapping_add(buffer_start_offset) + .wrapping_add(disconnect_offset) + .cast::() + .write(AtomicBool::new(false)); + } + unsafe { + memory + .as_ptr() + .wrapping_add(len_offset) .cast::() .write(AtomicU64::new(0)); } @@ -44,7 +55,9 @@ impl ShmemChannel { memory, server_event, client_event, - buffer_start_offset, + disconnect_offset, + len_offset, + data_offset, server: true, }) } @@ -55,13 +68,16 @@ impl ShmemChannel { let (client_event, client_event_len) = unsafe { Event::from_existing(memory.as_ptr().wrapping_add(server_event_len)) } .map_err(|err| eyre!("failed to open raw client event: {err}"))?; - let buffer_start_offset = server_event_len + client_event_len; + let (disconnect_offset, len_offset, data_offset) = + offsets(server_event_len, client_event_len); Ok(Self { memory, server_event, client_event, - buffer_start_offset, + disconnect_offset, + len_offset, + data_offset, server: false, }) } @@ -72,19 +88,19 @@ impl ShmemChannel { { let msg = bincode::serialize(value).wrap_err("failed to serialize value")?; - let total_len = LEN_LEN + msg.len(); - assert!(total_len <= self.memory.len() - self.buffer_start_offset); + self.send_raw(&msg) + } + fn send_raw(&mut self, msg: &[u8]) -> Result<(), eyre::ErrReport> { + assert!(msg.len() <= 
self.memory.len() - self.data_offset); // write data first unsafe { self.data_mut() .copy_from_nonoverlapping(msg.as_ptr(), msg.len()); } - // write len second for synchronization - unsafe { - (*self.data_len()).store(msg.len() as u64, std::sync::atomic::Ordering::Release); - } + self.data_len() + .store(msg.len() as u64, std::sync::atomic::Ordering::Release); // signal event let event = if self.server { @@ -95,11 +111,10 @@ impl ShmemChannel { event .set(EventState::Signaled) .map_err(|err| eyre!("failed to send message over ShmemChannel: {err}"))?; - Ok(()) } - pub fn receive(&mut self) -> eyre::Result + pub fn receive(&mut self) -> eyre::Result> where T: for<'a> Deserialize<'a> + std::fmt::Debug, { @@ -117,31 +132,73 @@ impl ShmemChannel { .wait(timeout) .map_err(|err| eyre!("failed to wait for reply from ShmemChannel: {err}"))?; - // read len first for synchronization - let msg_len = - unsafe { &*self.data_len() }.load(std::sync::atomic::Ordering::Acquire) as usize; - assert!(msg_len < self.memory.len() - self.buffer_start_offset - LEN_LEN); + // check for disconnect first + if self.disconnect().load(std::sync::atomic::Ordering::Acquire) { + return Ok(None); + } + + // then read len for synchronization + let msg_len = self.data_len().load(std::sync::atomic::Ordering::Acquire) as usize; + assert!(msg_len < self.memory.len() - self.data_offset); + // finally read the data let value_raw = unsafe { slice::from_raw_parts(self.data(), msg_len) }; - bincode::deserialize(value_raw).wrap_err("failed to deserialize value") + bincode::deserialize(value_raw) + .wrap_err("failed to deserialize value") + .map(|v| Some(v)) } - fn data_len(&self) -> *const AtomicU64 { - self.data_len_ptr().cast() + fn disconnect(&self) -> &AtomicBool { + unsafe { + &*self + .memory + .as_ptr() + .wrapping_add(self.disconnect_offset) + .cast::() + } } - fn data_len_ptr(&self) -> *mut u8 { - self.memory.as_ptr().wrapping_add(self.buffer_start_offset) + fn data_len(&self) -> &AtomicU64 { + unsafe 
{ + &*self + .memory + .as_ptr() + .wrapping_add(self.len_offset) + .cast::() + } } fn data(&self) -> *const u8 { - self.data_len_ptr().wrapping_add(LEN_LEN) + self.memory.as_ptr().wrapping_add(self.data_offset) } fn data_mut(&mut self) -> *mut u8 { - self.data_len_ptr().wrapping_add(LEN_LEN) + self.memory.as_ptr().wrapping_add(self.data_offset) } } +fn offsets(server_event_len: usize, client_event_len: usize) -> (usize, usize, usize) { + let disconnect_offset = server_event_len + client_event_len; + let len_offset = disconnect_offset + mem::size_of::(); + let data_offset = len_offset + mem::size_of::(); + (disconnect_offset, len_offset, data_offset) +} + unsafe impl Send for ShmemChannel {} + +impl Drop for ShmemChannel { + fn drop(&mut self) { + self.disconnect() + .store(true, std::sync::atomic::Ordering::Release); + // wake up other end + let event = if self.server { + &self.client_event + } else { + &self.server_event + }; + if let Err(err) = event.set(EventState::Signaled) { + tracing::warn!("failed to signal ShmemChannel disconnect: {err}"); + } + } +} From 670c154e5e8d321c0cb817eea5af1168ff0810f9 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 5 Jan 2023 16:20:27 +0100 Subject: [PATCH 087/225] Create a safer server/client API on top of `ShmemChannel` --- apis/rust/node/src/daemon.rs | 51 +++++--------- binaries/daemon/src/listener.rs | 10 +-- binaries/daemon/src/spawn.rs | 4 +- libraries/core/src/lib.rs | 2 +- .../channel.rs} | 9 ++- libraries/core/src/shared_memory/mod.rs | 66 +++++++++++++++++++ 6 files changed, 95 insertions(+), 47 deletions(-) rename libraries/core/src/{shm_channel.rs => shared_memory/channel.rs} (99%) create mode 100644 libraries/core/src/shared_memory/mod.rs diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index aefe82d1..725f23b2 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -8,7 +8,7 @@ use std::{ use dora_core::{ config::{DataId, NodeId}, 
daemon_messages::{ControlRequest, DataflowId, DropEvent, NodeEvent}, - shm_channel::ShmemChannel, + shared_memory::ShmemClient, }; use dora_message::Metadata; use eyre::{bail, eyre, Context}; @@ -41,7 +41,7 @@ impl DaemonConnection { } pub struct ControlChannel { - channel: ShmemChannel, + channel: ShmemClient, } impl ControlChannel { @@ -55,22 +55,17 @@ impl ControlChannel { .os_id(daemon_events_region_id) .open() .wrap_err("failed to connect to dora-daemon")?; - let mut channel = unsafe { ShmemChannel::new_client(daemon_events_region) } + let mut channel = unsafe { ShmemClient::new(daemon_events_region) } .wrap_err("failed to create ShmemChannel")?; let msg = ControlRequest::Register { dataflow_id, node_id: node_id.clone(), }; - channel - .send(&msg) + let reply = channel + .request(&msg) .wrap_err("failed to send register request to dora-daemon")?; - // wait for reply - let reply = channel - .receive() - .wrap_err("failed to wait for receive register reply from dora-daemon")? - .ok_or_else(|| eyre!("daemon disconnected unexpectedly"))?; match reply { dora_core::daemon_messages::ControlReply::Result(result) => result .map_err(|e| eyre!(e)) @@ -82,15 +77,11 @@ impl ControlChannel { } pub fn report_stop(&mut self) -> eyre::Result<()> { - self.channel - .send(&ControlRequest::Stopped) - .wrap_err("failed to report stopped to dora-daemon")?; - match self + let reply = self .channel - .receive() - .wrap_err("failed to receive stopped reply from dora-daemon")? - .ok_or_else(|| eyre!("daemon disconnected unexpectedly"))? 
- { + .request(&ControlRequest::Stopped) + .wrap_err("failed to report stopped to dora-daemon")?; + match reply { dora_core::daemon_messages::ControlReply::Result(result) => result .map_err(|e| eyre!(e)) .wrap_err("failed to report stop event to dora-daemon")?, @@ -105,19 +96,15 @@ impl ControlChannel { metadata: dora_message::Metadata<'static>, data_len: usize, ) -> eyre::Result { - self.channel - .send(&ControlRequest::PrepareOutputMessage { + let reply = self + .channel + .request(&ControlRequest::PrepareOutputMessage { output_id, metadata, data_len, }) .wrap_err("failed to send PrepareOutputMessage request to dora-daemon")?; - match self - .channel - .receive() - .wrap_err("failed to receive PrepareOutputMessage reply from dora-daemon")? - .ok_or_else(|| eyre!("daemon disconnected unexpectedly"))? - { + match reply { dora_core::daemon_messages::ControlReply::PreparedMessage { shared_memory_id: id, } => Ok(MessageSample { id }), @@ -129,15 +116,11 @@ impl ControlChannel { } pub fn send_message(&mut self, sample: MessageSample) -> eyre::Result<()> { - self.channel - .send(&ControlRequest::SendOutMessage { id: sample.id }) - .wrap_err("failed to send SendOutMessage request to dora-daemon")?; - match self + let reply = self .channel - .receive() - .wrap_err("failed to receive SendOutMessage reply from dora-daemon")? - .ok_or_else(|| eyre!("daemon disconnected unexpectedly"))? 
- { + .request(&ControlRequest::SendOutMessage { id: sample.id }) + .wrap_err("failed to send SendOutMessage request to dora-daemon")?; + match reply { dora_core::daemon_messages::ControlReply::Result(result) => { result.map_err(|err| eyre!(err)) } diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index a0f26bee..f252daea 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -4,7 +4,7 @@ use crate::{ }; use dora_core::{ daemon_messages::{self, DropEvent}, - shm_channel::ShmemChannel, + shared_memory::ShmemServer, }; use eyre::{eyre, Context}; use std::{io::ErrorKind, net::Ipv4Addr}; @@ -146,12 +146,12 @@ pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sende } #[tracing::instrument(skip(channel, events_tx))] -pub fn listener_loop(mut channel: ShmemChannel, events_tx: mpsc::Sender) { +pub fn listener_loop(mut channel: ShmemServer, events_tx: mpsc::Sender) { let mut id = None; let mut enter_subscribe_loop = None; loop { // receive the next message - let message = match channel.receive().wrap_err("failed to receive node message") { + let message = match channel.listen().wrap_err("failed to receive node message") { Ok(Some(m)) => m, Ok(None) => { tracing::info!("control channel disconnected: {id:?}"); @@ -173,7 +173,7 @@ pub fn listener_loop(mut channel: ShmemChannel, events_tx: mpsc::Sender) let reply = daemon_messages::ControlReply::Result(Ok(())); - match channel.send(&reply) { + match channel.send_reply(&reply) { Ok(()) => continue, // don't trigger an event for register calls Err(err) => { tracing::warn!("{err:?}"); @@ -234,7 +234,7 @@ pub fn listener_loop(mut channel: ShmemChannel, events_tx: mpsc::Sender) let Ok(reply) = reply.blocking_recv() else { break; // main loop exited }; - if let Err(err) = channel.send(&reply).wrap_err("failed to send reply") { + if let Err(err) = channel.send_reply(&reply).wrap_err("failed to send reply") { tracing::error!("{err:?}"); break; } diff 
--git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index e900ccf6..cbf7bed6 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -2,7 +2,7 @@ use crate::{listener::listener_loop, DoraEvent, Event}; use dora_core::{ daemon_messages::{DataflowId, NodeConfig, SpawnNodeParams}, descriptor::{resolve_path, source_is_url}, - shm_channel::ShmemChannel, + shared_memory::ShmemServer, }; use dora_download::download_file; use eyre::{eyre, WrapErr}; @@ -50,7 +50,7 @@ pub async fn spawn_node( daemon_port, daemon_events_region_id: daemon_events_region.get_os_id().to_owned(), }; - let channel = unsafe { ShmemChannel::new_server(daemon_events_region) } + let channel = unsafe { ShmemServer::new(daemon_events_region) } .wrap_err("failed to create ShmemChannel")?; let result_tx = events_tx.clone(); diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index 61a1433b..52e82393 100644 --- a/libraries/core/src/lib.rs +++ b/libraries/core/src/lib.rs @@ -8,7 +8,7 @@ pub mod config; pub mod coordinator_messages; pub mod daemon_messages; pub mod descriptor; -pub mod shm_channel; +pub mod shared_memory; pub mod topics; pub fn adjust_shared_library_path(path: &Path) -> Result { diff --git a/libraries/core/src/shm_channel.rs b/libraries/core/src/shared_memory/channel.rs similarity index 99% rename from libraries/core/src/shm_channel.rs rename to libraries/core/src/shared_memory/channel.rs index 8a995e26..5fbc1d43 100644 --- a/libraries/core/src/shm_channel.rs +++ b/libraries/core/src/shared_memory/channel.rs @@ -1,14 +1,13 @@ +use eyre::{eyre, Context}; +use raw_sync::events::{Event, EventImpl, EventInit, EventState}; +use serde::{Deserialize, Serialize}; +use shared_memory::Shmem; use std::{ mem, slice, sync::atomic::{AtomicBool, AtomicU64}, time::Duration, }; -use eyre::{eyre, Context}; -use raw_sync::events::{Event, EventImpl, EventInit, EventState}; -use serde::{Deserialize, Serialize}; -use shared_memory::Shmem; - pub struct 
ShmemChannel { memory: Shmem, server_event: Box, diff --git a/libraries/core/src/shared_memory/mod.rs b/libraries/core/src/shared_memory/mod.rs new file mode 100644 index 00000000..a5b87c7f --- /dev/null +++ b/libraries/core/src/shared_memory/mod.rs @@ -0,0 +1,66 @@ +use self::channel::ShmemChannel; +use eyre::eyre; +use serde::{Deserialize, Serialize}; +use shared_memory::Shmem; + +mod channel; + +pub struct ShmemServer { + channel: ShmemChannel, + reply_expected: bool, +} + +impl ShmemServer { + pub unsafe fn new(memory: Shmem) -> eyre::Result { + Ok(Self { + channel: ShmemChannel::new_server(memory)?, + reply_expected: false, + }) + } + + pub fn listen(&mut self) -> eyre::Result> + where + T: for<'a> Deserialize<'a> + std::fmt::Debug, + { + assert!(!self.reply_expected); + let result = self.channel.receive(); + if matches!(result, Ok(Some(_))) { + self.reply_expected = true; + } + + result + } + + pub fn send_reply(&mut self, value: &T) -> eyre::Result<()> + where + T: Serialize + std::fmt::Debug, + { + assert!(self.reply_expected); + self.channel.send(value)?; + self.reply_expected = false; + Ok(()) + } +} + +pub struct ShmemClient { + channel: ShmemChannel, +} + +impl ShmemClient { + pub unsafe fn new(memory: Shmem) -> eyre::Result { + Ok(Self { + channel: ShmemChannel::new_client(memory)?, + }) + } + + pub fn request(&mut self, value: &T) -> eyre::Result + where + T: Serialize + std::fmt::Debug, + U: for<'a> Deserialize<'a> + std::fmt::Debug, + { + self.channel.send(value)?; + self.channel + .receive()? 
+ .ok_or_else(|| eyre!("server disconnected unexpectedly")) + } +} From 503a9497fdbaed2d82a8280a545f7d25db7f91ea Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 15:36:15 +0100 Subject: [PATCH 088/225] Use shared memory for event stream too --- apis/rust/node/src/daemon.rs | 92 +++------- apis/rust/node/src/lib.rs | 13 +- binaries/daemon/src/lib.rs | 64 ++----- binaries/daemon/src/listener.rs | 236 ++++---------------------- binaries/daemon/src/spawn.rs | 17 +- libraries/core/src/daemon_messages.rs | 7 +- 6 files changed, 98 insertions(+), 331 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 725f23b2..af60ec83 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -1,18 +1,12 @@ -use std::{ - io::{ErrorKind, Read, Write}, - marker::PhantomData, - net::{Ipv4Addr, SocketAddr, TcpStream}, - time::Duration, -}; - use dora_core::{ config::{DataId, NodeId}, - daemon_messages::{ControlRequest, DataflowId, DropEvent, NodeEvent}, + daemon_messages::{ControlRequest, DataflowId, NodeEvent}, shared_memory::ShmemClient, }; use dora_message::Metadata; use eyre::{bail, eyre, Context}; use shared_memory::{Shmem, ShmemConf}; +use std::{marker::PhantomData, time::Duration}; pub struct DaemonConnection { pub control_channel: ControlChannel, @@ -23,14 +17,13 @@ impl DaemonConnection { pub fn init( dataflow_id: DataflowId, node_id: &NodeId, - daemon_port: u16, + daemon_control_region_id: &str, daemon_events_region_id: &str, ) -> eyre::Result { - let control_channel = ControlChannel::init(dataflow_id, node_id, daemon_events_region_id) + let control_channel = ControlChannel::init(dataflow_id, node_id, daemon_control_region_id) .wrap_err("failed to init control stream")?; - let daemon_addr = (Ipv4Addr::new(127, 0, 0, 1), daemon_port).into(); - let event_stream = EventStream::init(daemon_addr, dataflow_id, node_id) + let event_stream = EventStream::init(dataflow_id, node_id, daemon_events_region_id) 
.wrap_err("failed to init event stream")?; Ok(Self { @@ -49,10 +42,10 @@ impl ControlChannel { fn init( dataflow_id: DataflowId, node_id: &NodeId, - daemon_events_region_id: &str, + daemon_control_region_id: &str, ) -> eyre::Result { let daemon_events_region = ShmemConf::new() - .os_id(daemon_events_region_id) + .os_id(daemon_control_region_id) .open() .wrap_err("failed to connect to dora-daemon")?; let mut channel = unsafe { ShmemClient::new(daemon_events_region) } @@ -135,37 +128,32 @@ pub struct EventStream { impl EventStream { fn init( - daemon_addr: SocketAddr, dataflow_id: DataflowId, node_id: &NodeId, + daemon_events_region_id: &str, ) -> eyre::Result { - let mut event_stream = - TcpStream::connect(daemon_addr).wrap_err("failed to connect to dora-daemon")?; - event_stream - .set_nodelay(true) - .wrap_err("failed to set TCP_NODELAY")?; - tcp_send( - &mut event_stream, - &ControlRequest::Subscribe { + let daemon_events_region = ShmemConf::new() + .os_id(daemon_events_region_id) + .open() + .wrap_err("failed to connect to dora-daemon")?; + let mut channel = unsafe { ShmemClient::new(daemon_events_region) } + .wrap_err("failed to create ShmemChannel")?; + + channel + .request(&ControlRequest::Subscribe { dataflow_id, node_id: node_id.clone(), - }, - ) - .wrap_err("failed to send subscribe request to dora-daemon")?; - match tcp_receive(&mut event_stream) - .wrap_err("failed to receive subscribe reply from dora-daemon")? 
- { - dora_core::daemon_messages::ControlReply::Result(result) => result - .map_err(|e| eyre!(e)) - .wrap_err("failed to create subscription with dora-daemon")?, - other => bail!("unexpected subscribe reply: {other:?}"), - } + }) + .map_err(|e| eyre!(e)) + .wrap_err("failed to create subscription with dora-daemon")?; let (tx, rx) = flume::bounded(1); + let mut drop_tokens = Vec::new(); std::thread::spawn(move || loop { - let event: NodeEvent = match tcp_receive(&mut event_stream) { + let event: NodeEvent = match channel.request(&ControlRequest::NextEvent { + drop_tokens: std::mem::take(&mut drop_tokens), + }) { Ok(event) => event, - Err(err) if err.kind() == ErrorKind::UnexpectedEof => break, Err(err) => { let err = eyre!(err).wrap_err("failed to receive incoming event"); tracing::warn!("{err:?}"); @@ -199,11 +187,7 @@ impl EventStream { } if let Some(token) = drop_token { - let message = DropEvent { token }; - if let Err(err) = tcp_send(&mut event_stream, &message) { - tracing::warn!("failed to send drop token: {err}"); - break; - } + drop_tokens.push(token); } }); @@ -305,29 +289,3 @@ impl std::ops::Deref for MappedInputData<'_> { unsafe { &self.memory.as_slice()[..self.len] } } } - -fn tcp_send(connection: &mut TcpStream, request: &T) -> std::io::Result<()> { - let serialized = serde_json::to_vec(request)?; - - let len_raw = (serialized.len() as u64).to_le_bytes(); - connection.write_all(&len_raw)?; - connection.write_all(&serialized)?; - Ok(()) -} - -fn tcp_receive(connection: &mut TcpStream) -> std::io::Result -where - T: for<'a> serde::Deserialize<'a>, -{ - let reply_len = { - let mut raw = [0; 8]; - connection.read_exact(&mut raw)?; - u64::from_le_bytes(raw) as usize - }; - let mut reply_raw = vec![0; reply_len]; - connection.read_exact(&mut reply_raw)?; - - let reply = serde_json::from_slice(&reply_raw)?; - - Ok(reply) -} diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index a6b26b2f..46b3c1e4 100644 --- a/apis/rust/node/src/lib.rs +++ 
b/apis/rust/node/src/lib.rs @@ -5,7 +5,7 @@ use dora_core::{ daemon_messages::NodeConfig, }; pub use dora_message::{uhlc, Metadata, MetadataParameters}; -use eyre::{eyre, WrapErr}; +use eyre::WrapErr; pub use flume::Receiver; use shared_memory::ShmemConf; @@ -36,15 +36,20 @@ impl DoraNode { dataflow_id, node_id, run_config, - daemon_port, + daemon_control_region_id, daemon_events_region_id, } = node_config; let DaemonConnection { control_channel, event_stream, - } = DaemonConnection::init(dataflow_id, &node_id, daemon_port, &daemon_events_region_id) - .wrap_err("failed to connect to dora-daemon")?; + } = DaemonConnection::init( + dataflow_id, + &node_id, + &daemon_control_region_id, + &daemon_events_region_id, + ) + .wrap_err("failed to connect to dora-daemon")?; let node = Self { id: node_id, diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 033effaa..a1a79014 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -23,14 +23,10 @@ use std::{ use tcp_utils::tcp_receive; use tokio::{ fs, - net::TcpStream, sync::{mpsc, oneshot}, time::timeout, }; -use tokio_stream::{ - wrappers::{ReceiverStream, TcpListenerStream}, - Stream, StreamExt, -}; +use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt}; use uuid::Uuid; mod coordinator; @@ -39,7 +35,6 @@ mod spawn; mod tcp_utils; pub struct Daemon { - port: u16, prepared_messages: HashMap, sent_out_shared_memory: HashMap>, @@ -133,22 +128,8 @@ impl Daemon { machine_id: String, exit_when_done: Option>, ) -> eyre::Result<()> { - // create listener for node connection - let listener = listener::create_listener().await?; - let port = listener - .local_addr() - .wrap_err("failed to get local addr of listener")? 
- .port(); - let new_connections = TcpListenerStream::new(listener).map(|c| { - c.map(Event::NewConnection) - .wrap_err("failed to open connection") - .unwrap_or_else(Event::ConnectError) - }); - tracing::info!("Listening for node connections on 127.0.0.1:{port}"); - let (dora_events_tx, dora_events_rx) = mpsc::channel(5); let daemon = Self { - port, prepared_messages: Default::default(), sent_out_shared_memory: Default::default(), running: HashMap::new(), @@ -162,13 +143,7 @@ impl Daemon { Duration::from_secs(5), )) .map(|_| Event::WatchdogInterval); - let events = ( - external_events, - new_connections, - dora_events, - watchdog_interval, - ) - .merge(); + let events = (external_events, dora_events, watchdog_interval).merge(); daemon.run_inner(events).await } @@ -176,21 +151,10 @@ impl Daemon { mut self, incoming_events: impl Stream + Unpin, ) -> eyre::Result<()> { - let (node_events_tx, node_events_rx) = mpsc::channel(10); - let node_events = ReceiverStream::new(node_events_rx); - - let mut events = (incoming_events, node_events).merge(); + let mut events = incoming_events; while let Some(event) = events.next().await { match event { - Event::NewConnection(connection) => { - connection.set_nodelay(true)?; - let events_tx = node_events_tx.clone(); - tokio::spawn(listener::handle_connection(connection, events_tx)); - } - Event::ConnectError(err) => { - tracing::warn!("{:?}", err.wrap_err("failed to connect")); - } Event::Coordinator(CoordinatorEvent { event, reply_tx }) => { let (reply, status) = self.handle_coordinator_event(event).await; let _ = reply_tx.send(reply); @@ -212,16 +176,18 @@ impl Daemon { RunStatus::Continue => {} RunStatus::Exit => break, }, - Event::Drop(DropEvent { token }) => { - match self.sent_out_shared_memory.remove(&token) { - Some(rc) => { - if let Ok(_shmem) = Rc::try_unwrap(rc) { - tracing::trace!( - "freeing shared memory after receiving last drop token" - ) + Event::Drop(DropEvent { tokens }) => { + for token in tokens { + match 
self.sent_out_shared_memory.remove(&token) { + Some(rc) => { + if let Ok(_shmem) = Rc::try_unwrap(rc) { + tracing::trace!( + "freeing shared memory after receiving last drop token" + ) + } } + None => tracing::warn!("received unknown drop token {token:?}"), } - None => tracing::warn!("received unknown drop token {token:?}"), } } Event::WatchdogInterval => { @@ -326,7 +292,7 @@ impl Daemon { } } - spawn::spawn_node(dataflow_id, params, self.port, self.events_tx.clone()) + spawn::spawn_node(dataflow_id, params, self.events_tx.clone()) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; } @@ -655,8 +621,6 @@ type InputId = (NodeId, DataId); #[derive(Debug)] pub enum Event { - NewConnection(TcpStream), - ConnectError(eyre::Report), Node { dataflow_id: DataflowId, node_id: NodeId, diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index f252daea..64fada45 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -1,154 +1,15 @@ -use crate::{ - tcp_utils::{tcp_receive, tcp_send}, - DaemonNodeEvent, Event, -}; +use crate::{DaemonNodeEvent, Event}; use dora_core::{ daemon_messages::{self, DropEvent}, shared_memory::ShmemServer, }; -use eyre::{eyre, Context}; -use std::{io::ErrorKind, net::Ipv4Addr}; -use tokio::{ - net::{TcpListener, TcpStream}, - sync::{mpsc, oneshot}, -}; -use tokio_stream::StreamExt; - -pub async fn create_listener() -> eyre::Result { - let localhost = Ipv4Addr::new(127, 0, 0, 1); - let socket = match TcpListener::bind((localhost, 0)).await { - Ok(socket) => socket, - Err(err) => { - return Err(eyre::Report::new(err).wrap_err("failed to create local TCP listener")) - } - }; - Ok(socket) -} - -pub async fn handle_connection(mut connection: TcpStream, events_tx: mpsc::Sender) { - let mut id = None; - let mut enter_subscribe_loop = None; - loop { - // receive the next message and parse it - let raw = match tcp_receive(&mut connection).await { - Ok(data) => data, - Err(err) if 
err.kind() == ErrorKind::UnexpectedEof => { - break; - } - Err(err) => { - tracing::error!("{err:?}"); - continue; - } - }; - let message: daemon_messages::ControlRequest = - match serde_json::from_slice(&raw).wrap_err("failed to deserialize node message") { - Ok(e) => e, - Err(err) => { - tracing::warn!("{err:?}"); - continue; - } - }; - - // handle the message and translate it to a NodeEvent - let node_event = match message { - daemon_messages::ControlRequest::Register { - dataflow_id, - node_id, - } => { - id = Some((dataflow_id, node_id)); - - let reply = daemon_messages::ControlReply::Result(Ok(())); - let serialized = serde_json::to_vec(&reply) - .wrap_err("failed to serialize register result") - .unwrap(); - - match tcp_send(&mut connection, &serialized).await { - Ok(()) => continue, // don't trigger an event for register calls - Err(err) => { - tracing::warn!("{err:?}"); - break; // close connection - } - } - } - daemon_messages::ControlRequest::Stopped => DaemonNodeEvent::Stopped, - daemon_messages::ControlRequest::PrepareOutputMessage { - output_id, - metadata, - data_len, - } => DaemonNodeEvent::PrepareOutputMessage { - output_id, - metadata, - data_len, - }, - daemon_messages::ControlRequest::SendOutMessage { id } => { - DaemonNodeEvent::SendOutMessage { id } - } - daemon_messages::ControlRequest::Subscribe { - dataflow_id, - node_id, - } => { - let (tx, rx) = flume::bounded(10); - - id = Some((dataflow_id, node_id)); - enter_subscribe_loop = Some(rx); - - DaemonNodeEvent::Subscribe { event_sender: tx } - } - }; - - let (dataflow_id, node_id) = match &id { - Some(id) => id.clone(), - None => { - tracing::warn!( - "Ignoring node event because no register \ - message was sent yet: {node_event:?}" - ); - continue; - } - }; - - // send NodeEvent to daemon main loop - let (reply_tx, reply) = oneshot::channel(); - let event = Event::Node { - dataflow_id, - node_id, - event: node_event, - reply_sender: reply_tx, - }; - let Ok(()) = events_tx.send(event).await 
else { - break; - }; - - // wait for reply and send it out - let Ok(reply) = reply.await else { - break; // main loop exited - }; - let Ok(serialized) = serde_json::to_vec(&reply) else { - tracing::error!("failed to serialize reply"); - continue; - }; - match tcp_send(&mut connection, &serialized).await { - Ok(()) => {} - Err(err) if err.kind() == ErrorKind::UnexpectedEof => { - break; - } - Err(err) => { - tracing::error!("{err:?}"); - } - } - - // enter subscribe loop after receiving a subscribe message - if let Some(events) = enter_subscribe_loop { - subscribe_loop(connection, events, events_tx).await; - break; // the subscribe loop only exits when the connection was closed - } - } -} +use eyre::Context; +use tokio::sync::{mpsc, oneshot}; #[tracing::instrument(skip(channel, events_tx))] pub fn listener_loop(mut channel: ShmemServer, events_tx: mpsc::Sender) { let mut id = None; - let mut enter_subscribe_loop = None; + let mut events = None; loop { // receive the next message let message = match channel.listen().wrap_err("failed to receive node message") { @@ -201,10 +62,38 @@ pub fn listener_loop(mut channel: ShmemServer, events_tx: mpsc::Sender) { let (tx, rx) = flume::bounded(10); id = Some((dataflow_id, node_id)); - enter_subscribe_loop = Some(rx); + events = Some(rx); DaemonNodeEvent::Subscribe { event_sender: tx } } + daemon_messages::ControlRequest::NextEvent { drop_tokens } => { + let drop_event = Event::Drop(DropEvent { + tokens: drop_tokens, + }); + if events_tx.blocking_send(drop_event).is_err() { + break; + } + + let Some(events) = events.as_mut() else { + tracing::warn!( + "Ignoring event request because no subscribe \ + message was sent yet" + ); + continue; + }; + + let event = match events.recv() { + Ok(event) => event, + Err(_) => break, + }; + + if let Err(err) = channel.send_reply(&event).wrap_err("failed to send reply") { + tracing::error!("{err:?}"); + break; + } + + continue; // don't trigger an event (apart from the drop event sent above) + 
} }; let (dataflow_id, node_id) = match &id { @@ -238,64 +127,5 @@ pub fn listener_loop(mut channel: ShmemServer, events_tx: mpsc::Sender) { tracing::error!("{err:?}"); break; } - - // enter subscribe loop after receiving a subscribe message - if let Some(events) = enter_subscribe_loop { - todo!() - // subscribe_loop(connection, events, events_tx).await; - // break; // the subscribe loop only exits when the connection was closed - } - } -} - -async fn subscribe_loop( - connection: TcpStream, - events: flume::Receiver, - events_tx: mpsc::Sender, -) { - let (mut rx, mut tx) = connection.into_split(); - - tokio::spawn(async move { - loop { - let Ok(raw) = tcp_receive(&mut rx).await else { - break; - }; - - let event: DropEvent = match serde_json::from_slice(&raw) { - Ok(e) => e, - Err(err) => { - tracing::error!("Failed to parse incoming message: {err}"); - continue; - } - }; - if events_tx.send(Event::Drop(event)).await.is_err() { - break; - } - } - }); - - while let Some(event) = events.stream().next().await { - let message = match serde_json::to_vec(&event) { - Ok(m) => m, - Err(err) => { - let err = eyre!(err).wrap_err("failed to serialize node event"); - tracing::warn!("{err:?}"); - continue; - } - }; - match tcp_send(&mut tx, &message).await { - Ok(()) => {} - Err(err) - if err.kind() == ErrorKind::UnexpectedEof - || err.kind() == ErrorKind::BrokenPipe - || err.kind() == ErrorKind::ConnectionReset => - { - break; - } - Err(err) => { - tracing::error!("{err:?}"); - break; - } - } } } diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index cbf7bed6..0438ae1d 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -14,7 +14,6 @@ use tokio::sync::mpsc; pub async fn spawn_node( dataflow_id: DataflowId, params: SpawnNodeParams, - daemon_port: u16, events_tx: mpsc::Sender, ) -> eyre::Result<()> { let SpawnNodeParams { @@ -39,6 +38,10 @@ pub async fn spawn_node( .wrap_err_with(|| format!("failed to resolve node source 
`{}`", node.source))? }; + let daemon_control_region = ShmemConf::new() + .size(4096) + .create() + .wrap_err("failed to allocate daemon_control_region")?; let daemon_events_region = ShmemConf::new() .size(4096) .create() @@ -47,14 +50,18 @@ pub async fn spawn_node( dataflow_id, node_id: node_id.clone(), run_config: node.run_config.clone(), - daemon_port, + daemon_control_region_id: daemon_control_region.get_os_id().to_owned(), daemon_events_region_id: daemon_events_region.get_os_id().to_owned(), }; - let channel = unsafe { ShmemServer::new(daemon_events_region) } - .wrap_err("failed to create ShmemChannel")?; + let control_channel = unsafe { ShmemServer::new(daemon_control_region) } + .wrap_err("failed to create control_channel")?; + let events_channel = unsafe { ShmemServer::new(daemon_events_region) } + .wrap_err("failed to create events_channel")?; + let events_tx_cloned = events_tx.clone(); let result_tx = events_tx.clone(); - tokio::task::spawn_blocking(move || listener_loop(channel, events_tx)); + tokio::task::spawn_blocking(move || listener_loop(control_channel, events_tx)); + tokio::task::spawn_blocking(move || listener_loop(events_channel, events_tx_cloned)); let mut command = tokio::process::Command::new(&resolved_path); if let Some(args) = &node.args { diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 1bf70f7d..27a19647 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -13,7 +13,7 @@ pub struct NodeConfig { pub dataflow_id: DataflowId, pub node_id: NodeId, pub run_config: NodeRunConfig, - pub daemon_port: u16, + pub daemon_control_region_id: SharedMemoryId, pub daemon_events_region_id: SharedMemoryId, } @@ -36,6 +36,9 @@ pub enum ControlRequest { id: SharedMemoryId, }, Stopped, + NextEvent { + drop_tokens: Vec, + }, } impl ControlRequest { @@ -81,7 +84,7 @@ pub enum NodeEvent { #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct DropEvent { - pub 
token: DropToken, + pub tokens: Vec, } #[derive( From 5b12ba8428e99db226833143600b6680efef273d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 19:42:16 +0100 Subject: [PATCH 089/225] Panic when shm server is dropped before client --- libraries/core/src/shared_memory/channel.rs | 23 +++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/libraries/core/src/shared_memory/channel.rs b/libraries/core/src/shared_memory/channel.rs index 5fbc1d43..3cc10d33 100644 --- a/libraries/core/src/shared_memory/channel.rs +++ b/libraries/core/src/shared_memory/channel.rs @@ -188,16 +188,21 @@ unsafe impl Send for ShmemChannel {} impl Drop for ShmemChannel { fn drop(&mut self) { - self.disconnect() - .store(true, std::sync::atomic::Ordering::Release); - // wake up other end - let event = if self.server { - &self.client_event + if self.server { + // server must only exit after client is disconnected + let disconnected = self.disconnect().load(std::sync::atomic::Ordering::Acquire); + tracing::debug!("closing ShmemServer after client disconnect ({disconnected})"); + assert!(disconnected); } else { - &self.server_event - }; - if let Err(err) = event.set(EventState::Signaled) { - tracing::warn!("failed to signal ShmemChannel disconnect: {err}"); + tracing::debug!("disconnecting client"); + + self.disconnect() + .store(true, std::sync::atomic::Ordering::Release); + + // wake up server + if let Err(err) = self.server_event.set(EventState::Signaled) { + tracing::warn!("failed to signal ShmemChannel disconnect: {err}"); + } } } } From 3aa50d2209d5e4563fdcdbf580caf06d9ff52867 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 19:44:36 +0100 Subject: [PATCH 090/225] Assert that message length is never 0 --- libraries/core/src/shared_memory/channel.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/libraries/core/src/shared_memory/channel.rs b/libraries/core/src/shared_memory/channel.rs index 3cc10d33..30c71e02 100644 
--- a/libraries/core/src/shared_memory/channel.rs +++ b/libraries/core/src/shared_memory/channel.rs @@ -138,6 +138,7 @@ impl ShmemChannel { // then read len for synchronization let msg_len = self.data_len().load(std::sync::atomic::Ordering::Acquire) as usize; + assert_ne!(msg_len, 0); assert!(msg_len < self.memory.len() - self.data_offset); // finally read the data From 827fb1b241b63fe553a6b48569b196521036ad1a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 19:49:35 +0100 Subject: [PATCH 091/225] Fix: Don't use timeout on EventStream client --- apis/rust/node/src/daemon.rs | 10 ++++++---- libraries/core/src/shared_memory/channel.rs | 22 ++++++++++++--------- libraries/core/src/shared_memory/mod.rs | 17 +++++++++++----- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index af60ec83..502b7239 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -48,8 +48,9 @@ impl ControlChannel { .os_id(daemon_control_region_id) .open() .wrap_err("failed to connect to dora-daemon")?; - let mut channel = unsafe { ShmemClient::new(daemon_events_region) } - .wrap_err("failed to create ShmemChannel")?; + let mut channel = + unsafe { ShmemClient::new(daemon_events_region, Some(Duration::from_secs(5))) } + .wrap_err("failed to create ShmemChannel")?; let msg = ControlRequest::Register { dataflow_id, @@ -136,8 +137,9 @@ impl EventStream { .os_id(daemon_events_region_id) .open() .wrap_err("failed to connect to dora-daemon")?; - let mut channel = unsafe { ShmemClient::new(daemon_events_region) } - .wrap_err("failed to create ShmemChannel")?; + let mut channel = + unsafe { ShmemClient::new(daemon_events_region, None) } + .wrap_err("failed to create ShmemChannel")?; channel .request(&ControlRequest::Subscribe { diff --git a/libraries/core/src/shared_memory/channel.rs b/libraries/core/src/shared_memory/channel.rs index 30c71e02..93d20a8a 100644 --- 
a/libraries/core/src/shared_memory/channel.rs +++ b/libraries/core/src/shared_memory/channel.rs @@ -113,26 +113,30 @@ impl ShmemChannel { Ok(()) } - pub fn receive(&mut self) -> eyre::Result> + pub fn receive(&mut self, timeout: Option) -> eyre::Result> where T: for<'a> Deserialize<'a> + std::fmt::Debug, { // wait for event - let (event, timeout) = if self.server { - (&self.server_event, raw_sync::Timeout::Infinite) + let event = if self.server { + &self.server_event } else { - ( - &self.client_event, - raw_sync::Timeout::Val(Duration::from_secs(5)), - ) + &self.client_event }; - + let timeout = timeout + .map(raw_sync::Timeout::Val) + .unwrap_or(raw_sync::Timeout::Infinite); event .wait(timeout) - .map_err(|err| eyre!("failed to wait for reply from ShmemChannel: {err}"))?; + .map_err(|err| eyre!("failed to receive from ShmemChannel: {err}"))?; // check for disconnect first if self.disconnect().load(std::sync::atomic::Ordering::Acquire) { + if self.server { + tracing::trace!("shm client disconnected"); + } else { + tracing::error!("shm server disconnected"); + } return Ok(None); } diff --git a/libraries/core/src/shared_memory/mod.rs b/libraries/core/src/shared_memory/mod.rs index a5b87c7f..5a06c516 100644 --- a/libraries/core/src/shared_memory/mod.rs +++ b/libraries/core/src/shared_memory/mod.rs @@ -1,5 +1,7 @@ +use std::time::Duration; + use self::channel::ShmemChannel; -use eyre::eyre; +use eyre::{eyre, Context}; use serde::{Deserialize, Serialize}; use shared_memory::Shmem; @@ -23,7 +25,7 @@ impl ShmemServer { T: for<'a> Deserialize<'a> + std::fmt::Debug, { assert!(!self.reply_expected); - let result = self.channel.receive(); + let result = self.channel.receive(None); if matches!(result, Ok(Some(_))) { self.reply_expected = true; } @@ -44,12 +46,14 @@ impl ShmemServer { pub struct ShmemClient { channel: ShmemChannel, + timeout: Option, } impl ShmemClient { - pub unsafe fn new(memory: Shmem) -> eyre::Result { + pub unsafe fn new(memory: Shmem, timeout: Option) -> 
eyre::Result { Ok(Self { channel: ShmemChannel::new_client(memory)?, + timeout, }) } @@ -58,9 +62,12 @@ impl ShmemClient { T: Serialize + std::fmt::Debug, U: for<'a> Deserialize<'a> + std::fmt::Debug, { - self.channel.send(value)?; self.channel - .receive()? + .send(value) + .wrap_err("failed to send request")?; + self.channel + .receive(self.timeout) + .wrap_err("failed to receive reply")? .ok_or_else(|| eyre!("server disconnected unexpectedly")) } } From b7f47c3d99be62680f6ac5dc23c055f0e5f0f566 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 20:08:27 +0100 Subject: [PATCH 092/225] Log an error instead of panicking when ShmemServer is dropped before client --- libraries/core/src/shared_memory/channel.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libraries/core/src/shared_memory/channel.rs b/libraries/core/src/shared_memory/channel.rs index 93d20a8a..10643122 100644 --- a/libraries/core/src/shared_memory/channel.rs +++ b/libraries/core/src/shared_memory/channel.rs @@ -196,8 +196,11 @@ impl Drop for ShmemChannel { if self.server { // server must only exit after client is disconnected let disconnected = self.disconnect().load(std::sync::atomic::Ordering::Acquire); - tracing::debug!("closing ShmemServer after client disconnect ({disconnected})"); - assert!(disconnected); + if disconnected { + tracing::debug!("closing ShmemServer after client disconnect"); + } else { + tracing::error!("ShmemServer closed before client disconnect"); + } } else { tracing::debug!("disconnecting client"); From 5eddf1cae96ae32297a2e35ff9335731c667a287 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 20:10:16 +0100 Subject: [PATCH 093/225] Remove uneeded serialization methods --- libraries/core/src/daemon_messages.rs | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 27a19647..b7fe0c2b 100644 --- 
a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -41,16 +41,6 @@ pub enum ControlRequest { }, } -impl ControlRequest { - pub fn serialize(&self) -> Vec { - bincode::serialize(self).unwrap() - } - - pub fn deserialize(data: &[u8]) -> eyre::Result { - bincode::deserialize(data).wrap_err("failed to deserialize ControlRequest") - } -} - type SharedMemoryId = String; #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -59,16 +49,6 @@ pub enum ControlReply { PreparedMessage { shared_memory_id: SharedMemoryId }, } -impl ControlReply { - pub fn serialize(&self) -> Vec { - bincode::serialize(self).unwrap() - } - - pub fn deserialize(data: &[u8]) -> eyre::Result { - bincode::deserialize(data).wrap_err("failed to deserialize ControlReply") - } -} - #[derive(Debug, serde::Serialize, serde::Deserialize)] pub enum NodeEvent { Stop, From c9e8d8dfe4d3b48a034a60c5f82eee5300a9ea66 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 20:13:28 +0100 Subject: [PATCH 094/225] Join event stream thread before exiting --- apis/rust/node/src/daemon.rs | 17 ++++++++++------- apis/rust/node/src/lib.rs | 16 ++++++++++++++-- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 502b7239..0c2f09e1 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -6,15 +6,16 @@ use dora_core::{ use dora_message::Metadata; use eyre::{bail, eyre, Context}; use shared_memory::{Shmem, ShmemConf}; -use std::{marker::PhantomData, time::Duration}; +use std::{marker::PhantomData, thread::JoinHandle, time::Duration}; pub struct DaemonConnection { pub control_channel: ControlChannel, pub event_stream: EventStream, + pub(crate) event_stream_thread: JoinHandle<()>, } impl DaemonConnection { - pub fn init( + pub(crate) fn init( dataflow_id: DataflowId, node_id: &NodeId, daemon_control_region_id: &str, @@ -23,12 +24,14 @@ impl DaemonConnection { let 
control_channel = ControlChannel::init(dataflow_id, node_id, daemon_control_region_id) .wrap_err("failed to init control stream")?; - let event_stream = EventStream::init(dataflow_id, node_id, daemon_events_region_id) - .wrap_err("failed to init event stream")?; + let (event_stream, event_stream_thread) = + EventStream::init(dataflow_id, node_id, daemon_events_region_id) + .wrap_err("failed to init event stream")?; Ok(Self { control_channel, event_stream, + event_stream_thread, }) } } @@ -132,7 +135,7 @@ impl EventStream { dataflow_id: DataflowId, node_id: &NodeId, daemon_events_region_id: &str, - ) -> eyre::Result { + ) -> eyre::Result<(Self, JoinHandle<()>)> { let daemon_events_region = ShmemConf::new() .os_id(daemon_events_region_id) .open() @@ -151,7 +154,7 @@ impl EventStream { let (tx, rx) = flume::bounded(1); let mut drop_tokens = Vec::new(); - std::thread::spawn(move || loop { + let thread = std::thread::spawn(move || loop { let event: NodeEvent = match channel.request(&ControlRequest::NextEvent { drop_tokens: std::mem::take(&mut drop_tokens), }) { @@ -193,7 +196,7 @@ impl EventStream { } }); - Ok(EventStream { receiver: rx }) + Ok((EventStream { receiver: rx }, thread)) } pub fn recv(&mut self) -> Option { diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 46b3c1e4..3ce43f6e 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -1,3 +1,5 @@ +use std::thread::JoinHandle; + use daemon::{ControlChannel, DaemonConnection, EventStream}; pub use dora_core; use dora_core::{ @@ -16,6 +18,7 @@ pub struct DoraNode { node_config: NodeRunConfig, control_channel: ControlChannel, hlc: uhlc::HLC, + event_stream_thread: Option>, } impl DoraNode { @@ -43,6 +46,7 @@ impl DoraNode { let DaemonConnection { control_channel, event_stream, + event_stream_thread, } = DaemonConnection::init( dataflow_id, &node_id, @@ -56,6 +60,7 @@ impl DoraNode { node_config: run_config, control_channel, hlc: uhlc::HLC::default(), + event_stream_thread: 
Some(event_stream_thread), }; Ok((node, event_stream)) } @@ -111,8 +116,15 @@ impl DoraNode { impl Drop for DoraNode { #[tracing::instrument(skip(self), fields(self.id = %self.id))] fn drop(&mut self) { - if let Err(err) = self.control_channel.report_stop() { - tracing::error!("{err:?}"); + match self.control_channel.report_stop() { + Ok(()) => { + if let Some(thread) = self.event_stream_thread.take() { + if let Err(panic) = thread.join() { + std::panic::resume_unwind(panic); + } + } + } + Err(err) => tracing::error!("{err:?}"), } } } From c6d864b20f2e75a25679a9d2bd66ad79c39791aa Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 20:17:18 +0100 Subject: [PATCH 095/225] Make shmem server/client typed and report event stream closure --- apis/rust/node/src/daemon.rs | 42 +++++++++++------- binaries/daemon/src/lib.rs | 14 +++--- binaries/daemon/src/listener.rs | 58 +++++++++++++------------ libraries/core/src/daemon_messages.rs | 7 +-- libraries/core/src/shared_memory/mod.rs | 24 +++++----- 5 files changed, 81 insertions(+), 64 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 0c2f09e1..ed2099c9 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -1,6 +1,6 @@ use dora_core::{ config::{DataId, NodeId}, - daemon_messages::{ControlRequest, DataflowId, NodeEvent}, + daemon_messages::{DaemonReply, DaemonRequest, DataflowId, NodeEvent}, shared_memory::ShmemClient, }; use dora_message::Metadata; @@ -37,7 +37,7 @@ impl DaemonConnection { } pub struct ControlChannel { - channel: ShmemClient, + channel: ShmemClient, } impl ControlChannel { @@ -55,7 +55,7 @@ impl ControlChannel { unsafe { ShmemClient::new(daemon_events_region, Some(Duration::from_secs(5))) } .wrap_err("failed to create ShmemChannel")?; - let msg = ControlRequest::Register { + let msg = DaemonRequest::Register { dataflow_id, node_id: node_id.clone(), }; @@ -64,7 +64,7 @@ impl ControlChannel { .wrap_err("failed to send 
register request to dora-daemon")?; match reply { - dora_core::daemon_messages::ControlReply::Result(result) => result + dora_core::daemon_messages::DaemonReply::Result(result) => result .map_err(|e| eyre!(e)) .wrap_err("failed to register node with dora-daemon")?, other => bail!("unexpected register reply: {other:?}"), @@ -76,10 +76,10 @@ impl ControlChannel { pub fn report_stop(&mut self) -> eyre::Result<()> { let reply = self .channel - .request(&ControlRequest::Stopped) + .request(&DaemonRequest::Stopped) .wrap_err("failed to report stopped to dora-daemon")?; match reply { - dora_core::daemon_messages::ControlReply::Result(result) => result + dora_core::daemon_messages::DaemonReply::Result(result) => result .map_err(|e| eyre!(e)) .wrap_err("failed to report stop event to dora-daemon")?, other => bail!("unexpected stopped reply: {other:?}"), @@ -95,17 +95,17 @@ impl ControlChannel { ) -> eyre::Result { let reply = self .channel - .request(&ControlRequest::PrepareOutputMessage { + .request(&DaemonRequest::PrepareOutputMessage { output_id, metadata, data_len, }) .wrap_err("failed to send PrepareOutputMessage request to dora-daemon")?; match reply { - dora_core::daemon_messages::ControlReply::PreparedMessage { + dora_core::daemon_messages::DaemonReply::PreparedMessage { shared_memory_id: id, } => Ok(MessageSample { id }), - dora_core::daemon_messages::ControlReply::Result(Err(err)) => { + dora_core::daemon_messages::DaemonReply::Result(Err(err)) => { Err(eyre!(err).wrap_err("failed to report stop event to dora-daemon")) } other => bail!("unexpected PrepareOutputMessage reply: {other:?}"), @@ -115,10 +115,10 @@ impl ControlChannel { pub fn send_message(&mut self, sample: MessageSample) -> eyre::Result<()> { let reply = self .channel - .request(&ControlRequest::SendOutMessage { id: sample.id }) + .request(&DaemonRequest::SendOutMessage { id: sample.id }) .wrap_err("failed to send SendOutMessage request to dora-daemon")?; match reply { - 
dora_core::daemon_messages::ControlReply::Result(result) => { + dora_core::daemon_messages::DaemonReply::Result(result) => { result.map_err(|err| eyre!(err)) } other => bail!("unexpected SendOutMessage reply: {other:?}"), @@ -140,12 +140,12 @@ impl EventStream { .os_id(daemon_events_region_id) .open() .wrap_err("failed to connect to dora-daemon")?; - let mut channel = + let mut channel: ShmemClient = unsafe { ShmemClient::new(daemon_events_region, None) } .wrap_err("failed to create ShmemChannel")?; channel - .request(&ControlRequest::Subscribe { + .request(&DaemonRequest::Subscribe { dataflow_id, node_id: node_id.clone(), }) @@ -155,10 +155,20 @@ impl EventStream { let (tx, rx) = flume::bounded(1); let mut drop_tokens = Vec::new(); let thread = std::thread::spawn(move || loop { - let event: NodeEvent = match channel.request(&ControlRequest::NextEvent { + let daemon_request = DaemonRequest::NextEvent { drop_tokens: std::mem::take(&mut drop_tokens), - }) { - Ok(event) => event, + }; + let event: NodeEvent = match channel.request(&daemon_request) { + Ok(DaemonReply::NodeEvent(event)) => event, + Ok(DaemonReply::Closed) => { + tracing::debug!("Event stream closed"); + break; + } + Ok(other) => { + let err = eyre!("unexpected control reply: {other:?}"); + tracing::warn!("{err:?}"); + continue; + } Err(err) => { let err = eyre!(err).wrap_err("failed to receive incoming event"); tracing::warn!("{err:?}"); diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index a1a79014..485916f5 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -3,7 +3,7 @@ use dora_core::{ config::{DataId, InputMapping, NodeId}, coordinator_messages::DaemonEvent, daemon_messages::{ - self, ControlReply, DaemonCoordinatorEvent, DaemonCoordinatorReply, DataflowId, DropEvent, + self, DaemonCoordinatorEvent, DaemonCoordinatorReply, DaemonReply, DataflowId, DropEvent, DropToken, SpawnDataflowNodes, SpawnNodeParams, }, descriptor::{CoreNodeKind, Descriptor}, @@ 
-329,7 +329,7 @@ impl Daemon { event: DaemonNodeEvent, dataflow_id: DataflowId, node_id: NodeId, - reply_sender: oneshot::Sender, + reply_sender: oneshot::Sender, ) -> eyre::Result<()> { match event { DaemonNodeEvent::Subscribe { event_sender } => { @@ -342,7 +342,7 @@ impl Daemon { "subscribe failed: no running dataflow with ID `{dataflow_id}`" )), }; - let _ = reply_sender.send(ControlReply::Result(result)); + let _ = reply_sender.send(DaemonReply::Result(result)); } DaemonNodeEvent::PrepareOutputMessage { output_id, @@ -370,7 +370,7 @@ impl Daemon { }; self.prepared_messages.insert(id.clone(), message); - let reply = ControlReply::PreparedMessage { + let reply = DaemonReply::PreparedMessage { shared_memory_id: id.clone(), }; if reply_sender.send(reply).is_err() { @@ -444,12 +444,12 @@ impl Daemon { let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), *len); } - let _ = reply_sender.send(ControlReply::Result(Ok(()))); + let _ = reply_sender.send(DaemonReply::Result(Ok(()))); } DaemonNodeEvent::Stopped => { tracing::info!("Stopped: {dataflow_id}/{node_id}"); - let _ = reply_sender.send(ControlReply::Result(Ok(()))); + let _ = reply_sender.send(DaemonReply::Result(Ok(()))); // notify downstream nodes let dataflow = self @@ -625,7 +625,7 @@ pub enum Event { dataflow_id: DataflowId, node_id: NodeId, event: DaemonNodeEvent, - reply_sender: oneshot::Sender, + reply_sender: oneshot::Sender, }, Coordinator(CoordinatorEvent), Dora(DoraEvent), diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 64fada45..c18bdd03 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -1,21 +1,24 @@ use crate::{DaemonNodeEvent, Event}; use dora_core::{ - daemon_messages::{self, DropEvent}, + daemon_messages::{DaemonReply, DaemonRequest, DropEvent}, shared_memory::ShmemServer, }; use eyre::Context; use tokio::sync::{mpsc, oneshot}; -#[tracing::instrument(skip(channel, events_tx))] -pub fn listener_loop(mut channel: 
ShmemServer, events_tx: mpsc::Sender) { +#[tracing::instrument(skip(server, events_tx))] +pub fn listener_loop( + mut server: ShmemServer, + events_tx: mpsc::Sender, +) { let mut id = None; let mut events = None; loop { // receive the next message - let message = match channel.listen().wrap_err("failed to receive node message") { + let message = match server.listen().wrap_err("failed to receive DaemonRequest") { Ok(Some(m)) => m, Ok(None) => { - tracing::info!("control channel disconnected: {id:?}"); + tracing::info!("channel disconnected: {id:?}"); break; } // disconnected Err(err) => { @@ -26,15 +29,15 @@ pub fn listener_loop(mut channel: ShmemServer, events_tx: mpsc::Sender) { // handle the message and translate it to a NodeEvent let node_event = match message { - daemon_messages::ControlRequest::Register { + DaemonRequest::Register { dataflow_id, node_id, } => { id = Some((dataflow_id, node_id)); - let reply = daemon_messages::ControlReply::Result(Ok(())); + let reply = DaemonReply::Result(Ok(())); - match channel.send_reply(&reply) { + match server.send_reply(&reply) { Ok(()) => continue, // don't trigger an event for register calls Err(err) => { tracing::warn!("{err:?}"); @@ -42,8 +45,8 @@ pub fn listener_loop(mut channel: ShmemServer, events_tx: mpsc::Sender) { } } } - daemon_messages::ControlRequest::Stopped => DaemonNodeEvent::Stopped, - daemon_messages::ControlRequest::PrepareOutputMessage { + DaemonRequest::Stopped => DaemonNodeEvent::Stopped, + DaemonRequest::PrepareOutputMessage { output_id, metadata, data_len, @@ -52,10 +55,8 @@ pub fn listener_loop(mut channel: ShmemServer, events_tx: mpsc::Sender) { metadata, data_len, }, - daemon_messages::ControlRequest::SendOutMessage { id } => { - DaemonNodeEvent::SendOutMessage { id } - } - daemon_messages::ControlRequest::Subscribe { + DaemonRequest::SendOutMessage { id } => DaemonNodeEvent::SendOutMessage { id }, + DaemonRequest::Subscribe { dataflow_id, node_id, } => { @@ -66,28 +67,29 @@ pub fn 
listener_loop(mut channel: ShmemServer, events_tx: mpsc::Sender) { DaemonNodeEvent::Subscribe { event_sender: tx } } - daemon_messages::ControlRequest::NextEvent { drop_tokens } => { + DaemonRequest::NextEvent { drop_tokens } => { let drop_event = Event::Drop(DropEvent { tokens: drop_tokens, }); if events_tx.blocking_send(drop_event).is_err() { - break; - } - - let Some(events) = events.as_mut() else { tracing::warn!( - "Ignoring event request because no subscribe \ - message was sent yet" + "`events_tx` was closed unexpectedly when trying to send drop tokens" ); - continue; - }; + } - let event = match events.recv() { - Ok(event) => event, - Err(_) => break, + let reply = match events.as_mut() { + Some(events) => match events.recv() { + Ok(event) => DaemonReply::NodeEvent(event), + Err(flume::RecvError::Disconnected) => DaemonReply::Closed, + }, + None => { + DaemonReply::Result(Err("Ignoring event request because no subscribe \ + message was sent yet" + .into())) + } }; - if let Err(err) = channel.send_reply(&event).wrap_err("failed to send reply") { + if let Err(err) = server.send_reply(&reply).wrap_err("failed to send reply") { tracing::error!("{err:?}"); break; } @@ -123,7 +125,7 @@ pub fn listener_loop(mut channel: ShmemServer, events_tx: mpsc::Sender) { let Ok(reply) = reply.blocking_recv() else { break; // main loop exited }; - if let Err(err) = channel.send_reply(&reply).wrap_err("failed to send reply") { + if let Err(err) = server.send_reply(&reply).wrap_err("failed to send reply") { tracing::error!("{err:?}"); break; } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index b7fe0c2b..a2845042 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -5,7 +5,6 @@ use crate::{ descriptor, }; use dora_message::Metadata; -use eyre::Context; use uuid::Uuid; #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -18,7 +17,7 @@ pub struct NodeConfig { } #[derive(Debug, 
serde::Serialize, serde::Deserialize)] -pub enum ControlRequest { +pub enum DaemonRequest { Register { dataflow_id: DataflowId, node_id: NodeId, @@ -44,9 +43,11 @@ pub enum ControlRequest { type SharedMemoryId = String; #[derive(Debug, serde::Serialize, serde::Deserialize)] -pub enum ControlReply { +pub enum DaemonReply { Result(Result<(), String>), PreparedMessage { shared_memory_id: SharedMemoryId }, + Closed, + NodeEvent(NodeEvent), } #[derive(Debug, serde::Serialize, serde::Deserialize)] diff --git a/libraries/core/src/shared_memory/mod.rs b/libraries/core/src/shared_memory/mod.rs index 5a06c516..d230f535 100644 --- a/libraries/core/src/shared_memory/mod.rs +++ b/libraries/core/src/shared_memory/mod.rs @@ -1,26 +1,28 @@ -use std::time::Duration; - use self::channel::ShmemChannel; use eyre::{eyre, Context}; use serde::{Deserialize, Serialize}; use shared_memory::Shmem; +use std::marker::PhantomData; +use std::time::Duration; mod channel; -pub struct ShmemServer { +pub struct ShmemServer { channel: ShmemChannel, reply_expected: bool, + phantom: PhantomData<(T, U)>, } -impl ShmemServer { +impl ShmemServer { pub unsafe fn new(memory: Shmem) -> eyre::Result { Ok(Self { channel: ShmemChannel::new_server(memory)?, reply_expected: false, + phantom: PhantomData, }) } - pub fn listen(&mut self) -> eyre::Result> + pub fn listen(&mut self) -> eyre::Result> where T: for<'a> Deserialize<'a> + std::fmt::Debug, { @@ -33,9 +35,9 @@ impl ShmemServer { result } - pub fn send_reply(&mut self, value: &T) -> eyre::Result<()> + pub fn send_reply(&mut self, value: &U) -> eyre::Result<()> where - T: Serialize + std::fmt::Debug, + U: Serialize + std::fmt::Debug, { assert!(self.reply_expected); self.channel.send(value)?; @@ -44,20 +46,22 @@ impl ShmemServer { } } -pub struct ShmemClient { +pub struct ShmemClient { channel: ShmemChannel, timeout: Option, + phantom: PhantomData<(T, U)>, } -impl ShmemClient { +impl ShmemClient { pub unsafe fn new(memory: Shmem, timeout: Option) -> 
eyre::Result { Ok(Self { channel: ShmemChannel::new_client(memory)?, timeout, + phantom: PhantomData, }) } - pub fn request(&mut self, value: &T) -> eyre::Result + pub fn request(&mut self, value: &T) -> eyre::Result where T: Serialize + std::fmt::Debug, U: for<'a> Deserialize<'a> + std::fmt::Debug, From b73c872043d426e922cde73258b290c7c7b83296 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 20:26:01 +0100 Subject: [PATCH 096/225] Use blocks for initializing shmem severs --- binaries/daemon/src/spawn.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 0438ae1d..38b181ae 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -53,15 +53,23 @@ pub async fn spawn_node( daemon_control_region_id: daemon_control_region.get_os_id().to_owned(), daemon_events_region_id: daemon_events_region.get_os_id().to_owned(), }; - let control_channel = unsafe { ShmemServer::new(daemon_control_region) } - .wrap_err("failed to create control_channel")?; - let events_channel = unsafe { ShmemServer::new(daemon_events_region) } - .wrap_err("failed to create events_channel")?; - let events_tx_cloned = events_tx.clone(); - let result_tx = events_tx.clone(); - tokio::task::spawn_blocking(move || listener_loop(control_channel, events_tx)); - tokio::task::spawn_blocking(move || listener_loop(events_channel, events_tx_cloned)); + { + let server = unsafe { ShmemServer::new(daemon_control_region) } + .wrap_err("failed to create control server")?; + let events_tx = events_tx.clone(); + tokio::task::spawn_blocking(move || listener_loop(server, events_tx)); + } + { + let server = unsafe { ShmemServer::new(daemon_events_region) } + .wrap_err("failed to create events server")?; + let event_loop_node_id = format!("{dataflow_id}/{node_id}"); + let events_tx = events_tx.clone(); + tokio::task::spawn_blocking(move || { + listener_loop(server, 
events_tx); + tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); + }); + } let mut command = tokio::process::Command::new(&resolved_path); if let Some(args) = &node.args { @@ -107,7 +115,7 @@ pub async fn spawn_node( node_id: node_id_cloned, result, }; - let _ = result_tx.send(event.into()).await; + let _ = events_tx.send(event.into()).await; }); Ok(()) } From c9ab38d6bc776f49f4005bd1496d175bc960c877 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 20:27:52 +0100 Subject: [PATCH 097/225] Set up tracing subscribers in benchmark example --- Cargo.lock | 4 ++++ examples/benchmark/node/Cargo.toml | 2 ++ examples/benchmark/node/src/main.rs | 18 ++++++++++++++++-- examples/benchmark/sink/Cargo.toml | 2 ++ examples/benchmark/sink/src/main.rs | 15 +++++++++++++++ 5 files changed, 39 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index acc4e7b6..7d0ebed2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -281,6 +281,8 @@ dependencies = [ "futures", "rand", "tokio", + "tracing", + "tracing-subscriber", ] [[package]] @@ -289,6 +291,8 @@ version = "0.1.2" dependencies = [ "dora-node-api", "eyre", + "tracing", + "tracing-subscriber", ] [[package]] diff --git a/examples/benchmark/node/Cargo.toml b/examples/benchmark/node/Cargo.toml index 35e582a7..ccc55c4a 100644 --- a/examples/benchmark/node/Cargo.toml +++ b/examples/benchmark/node/Cargo.toml @@ -11,3 +11,5 @@ eyre = "0.6.8" futures = "0.3.21" rand = "0.8.5" tokio = { version = "1.20.1", features = ["rt", "macros"] } +tracing = "0.1.36" +tracing-subscriber = "0.3.15" diff --git a/examples/benchmark/node/src/main.rs b/examples/benchmark/node/src/main.rs index 59dab3a1..1a41c904 100644 --- a/examples/benchmark/node/src/main.rs +++ b/examples/benchmark/node/src/main.rs @@ -1,9 +1,12 @@ -use std::time::Duration; - use dora_node_api::{self, dora_core::config::DataId, DoraNode}; +use eyre::Context; use rand::Rng; +use std::time::Duration; +use tracing_subscriber::Layer; fn 
main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing subscriber")?; + let latency = DataId::from("latency".to_owned()); let throughput = DataId::from("throughput".to_owned()); @@ -56,3 +59,14 @@ fn main() -> eyre::Result<()> { Ok(()) } + +fn set_up_tracing() -> eyre::Result<()> { + use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer() + .pretty() + .with_filter(tracing::metadata::LevelFilter::DEBUG); + let subscriber = tracing_subscriber::Registry::default().with(stdout_log); + tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} diff --git a/examples/benchmark/sink/Cargo.toml b/examples/benchmark/sink/Cargo.toml index 58545c97..940361ac 100644 --- a/examples/benchmark/sink/Cargo.toml +++ b/examples/benchmark/sink/Cargo.toml @@ -8,3 +8,5 @@ edition = "2021" [dependencies] dora-node-api = { workspace = true } eyre = "0.6.8" +tracing = "0.1.36" +tracing-subscriber = "0.3.15" diff --git a/examples/benchmark/sink/src/main.rs b/examples/benchmark/sink/src/main.rs index 9fea5acb..97703852 100644 --- a/examples/benchmark/sink/src/main.rs +++ b/examples/benchmark/sink/src/main.rs @@ -1,7 +1,11 @@ use dora_node_api::{self, daemon::Event, DoraNode}; +use eyre::Context; use std::time::{Duration, Instant}; +use tracing_subscriber::Layer; fn main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing subscriber")?; + let (_node, mut events) = DoraNode::init_from_env()?; // latency is tested first @@ -74,3 +78,14 @@ fn record_results( }; println!("{msg}"); } + +fn set_up_tracing() -> eyre::Result<()> { + use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer() + .pretty() + .with_filter(tracing::metadata::LevelFilter::DEBUG); + let subscriber = tracing_subscriber::Registry::default().with(stdout_log); + 
tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} From 7b6f24c24b0d3f9f6b782dfe6ffa6a73c8811d24 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 11 Jan 2023 20:34:51 +0100 Subject: [PATCH 098/225] Confirm output sending a bit earlier --- binaries/daemon/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 485916f5..c01a804f 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -394,6 +394,8 @@ impl Daemon { format!("send out failed: no running dataflow with ID `{dataflow_id}`") })?; + let _ = reply_sender.send(DaemonReply::Result(Ok(()))); + // figure out receivers from dataflow graph let empty_set = BTreeSet::new(); let local_receivers = dataflow @@ -443,8 +445,6 @@ impl Daemon { if let Some((memory, len)) = &data { let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), *len); } - - let _ = reply_sender.send(DaemonReply::Result(Ok(()))); } DaemonNodeEvent::Stopped => { tracing::info!("Stopped: {dataflow_id}/{node_id}"); From bf978d9d7d426fb512cba71fcb14a10d2db3d2e6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jan 2023 15:26:01 +0100 Subject: [PATCH 099/225] Always send register message first, also when subscribing --- apis/rust/node/src/daemon.rs | 44 ++++++++++++++++----------- libraries/core/src/daemon_messages.rs | 5 +-- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index ed2099c9..424c3ac4 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -55,20 +55,7 @@ impl ControlChannel { unsafe { ShmemClient::new(daemon_events_region, Some(Duration::from_secs(5))) } .wrap_err("failed to create ShmemChannel")?; - let msg = DaemonRequest::Register { - dataflow_id, - node_id: node_id.clone(), - }; - let reply = channel - .request(&msg) - .wrap_err("failed to send 
register request to dora-daemon")?; - - match reply { - dora_core::daemon_messages::DaemonReply::Result(result) => result - .map_err(|e| eyre!(e)) - .wrap_err("failed to register node with dora-daemon")?, - other => bail!("unexpected register reply: {other:?}"), - } + register(dataflow_id, node_id.clone(), &mut channel)?; Ok(Self { channel }) } @@ -126,6 +113,28 @@ impl ControlChannel { } } +fn register( + dataflow_id: DataflowId, + node_id: NodeId, + channel: &mut ShmemClient, +) -> eyre::Result<()> { + let msg = DaemonRequest::Register { + dataflow_id, + node_id, + }; + let reply = channel + .request(&msg) + .wrap_err("failed to send register request to dora-daemon")?; + + match reply { + dora_core::daemon_messages::DaemonReply::Result(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to register node with dora-daemon")?, + other => bail!("unexpected register reply: {other:?}"), + } + Ok(()) +} + pub struct EventStream { receiver: flume::Receiver<(NodeEvent, std::sync::mpsc::Sender<()>)>, } @@ -144,11 +153,10 @@ impl EventStream { unsafe { ShmemClient::new(daemon_events_region, None) } .wrap_err("failed to create ShmemChannel")?; + register(dataflow_id, node_id.clone(), &mut channel)?; + channel - .request(&DaemonRequest::Subscribe { - dataflow_id, - node_id: node_id.clone(), - }) + .request(&DaemonRequest::Subscribe) .map_err(|e| eyre!(e)) .wrap_err("failed to create subscription with dora-daemon")?; diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index a2845042..04b4a8a1 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -22,10 +22,7 @@ pub enum DaemonRequest { dataflow_id: DataflowId, node_id: NodeId, }, - Subscribe { - dataflow_id: DataflowId, - node_id: NodeId, - }, + Subscribe, PrepareOutputMessage { output_id: DataId, metadata: Metadata<'static>, From 68b46f7ba741945a19f53f7ee58be1d3635c908d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 
Jan 2023 15:27:21 +0100 Subject: [PATCH 100/225] Move shared memory handling to separate task To reduce load on the main task. --- binaries/daemon/src/lib.rs | 317 ++++++++++++---------- binaries/daemon/src/listener.rs | 247 +++++++++++------ binaries/daemon/src/shared_mem_handler.rs | 270 ++++++++++++++++++ binaries/daemon/src/spawn.rs | 17 +- 4 files changed, 612 insertions(+), 239 deletions(-) create mode 100644 binaries/daemon/src/shared_mem_handler.rs diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index c01a804f..e0fb514c 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -3,8 +3,8 @@ use dora_core::{ config::{DataId, InputMapping, NodeId}, coordinator_messages::DaemonEvent, daemon_messages::{ - self, DaemonCoordinatorEvent, DaemonCoordinatorReply, DaemonReply, DataflowId, DropEvent, - DropToken, SpawnDataflowNodes, SpawnNodeParams, + self, DaemonCoordinatorEvent, DaemonCoordinatorReply, DaemonReply, DataflowId, DropToken, + SpawnDataflowNodes, SpawnNodeParams, }, descriptor::{CoreNodeKind, Descriptor}, }; @@ -12,13 +12,13 @@ use dora_message::uhlc::HLC; use eyre::{bail, eyre, Context, ContextCompat}; use futures::{future, stream, FutureExt, TryFutureExt}; use futures_concurrency::stream::Merge; -use shared_memory::{Shmem, ShmemConf}; +use shared_mem_handler::SharedMemSample; use std::{ collections::{BTreeMap, BTreeSet, HashMap}, + fmt, net::SocketAddr, path::Path, - rc::Rc, - time::Duration, + time::{Duration, Instant}, }; use tcp_utils::tcp_receive; use tokio::{ @@ -31,17 +31,18 @@ use uuid::Uuid; mod coordinator; mod listener; +mod shared_mem_handler; mod spawn; mod tcp_utils; pub struct Daemon { - prepared_messages: HashMap, - sent_out_shared_memory: HashMap>, - running: HashMap, events_tx: mpsc::Sender, + shared_memory_handler: flume::Sender, + shared_memory_handler_node: flume::Sender, + coordinator_addr: Option, machine_id: String, @@ -129,21 +130,37 @@ impl Daemon { exit_when_done: Option>, ) -> 
eyre::Result<()> { let (dora_events_tx, dora_events_rx) = mpsc::channel(5); + let (shared_memory_handler, shared_memory_daemon_rx) = flume::unbounded(); + let (shared_memory_handler_node, shared_memory_node_rx) = flume::bounded(10); let daemon = Self { - prepared_messages: Default::default(), - sent_out_shared_memory: Default::default(), running: HashMap::new(), events_tx: dora_events_tx, + shared_memory_handler, + shared_memory_handler_node, coordinator_addr, machine_id, exit_when_done, }; + let (shmem_events_tx, shmem_events_rx) = flume::bounded(5); + tokio::spawn(async { + let mut handler = shared_mem_handler::SharedMemHandler::new(shmem_events_tx); + handler + .run(shared_memory_node_rx, shared_memory_daemon_rx) + .await; + }); let dora_events = ReceiverStream::new(dora_events_rx); + let shmem_events = shmem_events_rx.into_stream().map(Event::ShmemHandler); let watchdog_interval = tokio_stream::wrappers::IntervalStream::new(tokio::time::interval( Duration::from_secs(5), )) .map(|_| Event::WatchdogInterval); - let events = (external_events, dora_events, watchdog_interval).merge(); + let events = ( + external_events, + dora_events, + shmem_events, + watchdog_interval, + ) + .merge(); daemon.run_inner(events).await } @@ -154,6 +171,8 @@ impl Daemon { let mut events = incoming_events; while let Some(event) = events.next().await { + let start = Instant::now(); + let event_debug = format!("{event:?}"); match event { Event::Coordinator(CoordinatorEvent { event, reply_tx }) => { let (reply, status) = self.handle_coordinator_event(event).await; @@ -176,20 +195,7 @@ impl Daemon { RunStatus::Continue => {} RunStatus::Exit => break, }, - Event::Drop(DropEvent { tokens }) => { - for token in tokens { - match self.sent_out_shared_memory.remove(&token) { - Some(rc) => { - if let Ok(_shmem) = Rc::try_unwrap(rc) { - tracing::trace!( - "freeing shared memory after receiving last drop token" - ) - } - } - None => tracing::warn!("received unknown drop token {token:?}"), - } - } - 
} + Event::ShmemHandler(event) => self.handle_shmem_handler_event(event).await?, Event::WatchdogInterval => { if let Some(addr) = self.coordinator_addr { let mut connection = coordinator::send_event( @@ -208,6 +214,11 @@ impl Daemon { } } } + + let elapsed = start.elapsed(); + // if elapsed.as_micros() > 10 { + // tracing::debug!("handled event in {elapsed:?}: {event_debug}"); + // } } Ok(()) @@ -292,9 +303,14 @@ impl Daemon { } } - spawn::spawn_node(dataflow_id, params, self.events_tx.clone()) - .await - .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; + spawn::spawn_node( + dataflow_id, + params, + self.events_tx.clone(), + self.shared_memory_handler_node.clone(), + ) + .await + .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; } for interval in dataflow.timers.keys().copied() { let events_tx = self.events_tx.clone(); @@ -344,108 +360,6 @@ impl Daemon { }; let _ = reply_sender.send(DaemonReply::Result(result)); } - DaemonNodeEvent::PrepareOutputMessage { - output_id, - metadata, - data_len, - } => { - let memory = if data_len > 0 { - Some( - ShmemConf::new() - .size(data_len) - .create() - .wrap_err("failed to allocate shared memory")?, - ) - } else { - None - }; - let id = memory - .as_ref() - .map(|m| m.get_os_id().to_owned()) - .unwrap_or_else(|| Uuid::new_v4().to_string()); - let message = PreparedMessage { - output_id, - metadata, - data: memory.map(|m| (m, data_len)), - }; - self.prepared_messages.insert(id.clone(), message); - - let reply = DaemonReply::PreparedMessage { - shared_memory_id: id.clone(), - }; - if reply_sender.send(reply).is_err() { - // free shared memory slice again - self.prepared_messages.remove(&id); - } - } - DaemonNodeEvent::SendOutMessage { id } => { - let message = self - .prepared_messages - .remove(&id) - .ok_or_else(|| eyre!("invalid shared memory id"))?; - let PreparedMessage { - output_id, - metadata, - data, - } = message; - let data = data.map(|(m, len)| (Rc::new(m), len)); - - let dataflow = 
self.running.get_mut(&dataflow_id).wrap_err_with(|| { - format!("send out failed: no running dataflow with ID `{dataflow_id}`") - })?; - - let _ = reply_sender.send(DaemonReply::Result(Ok(()))); - - // figure out receivers from dataflow graph - let empty_set = BTreeSet::new(); - let local_receivers = dataflow - .mappings - .get(&(node_id, output_id)) - .unwrap_or(&empty_set); - - // send shared memory ID to all local receivers - let mut closed = Vec::new(); - for (receiver_id, input_id) in local_receivers { - if let Some(channel) = dataflow.subscribe_channels.get(receiver_id) { - let drop_token = DropToken::generate(); - let send_result = channel.send_async(daemon_messages::NodeEvent::Input { - id: input_id.clone(), - metadata: metadata.clone(), - data: data.as_ref().map(|(m, len)| daemon_messages::InputData { - shared_memory_id: m.get_os_id().to_owned(), - len: *len, - drop_token: drop_token.clone(), - }), - }); - - match timeout(Duration::from_millis(10), send_result).await { - Ok(Ok(())) => { - // keep shared memory ptr in order to free it once all subscribers are done - if let Some((memory, _)) = &data { - self.sent_out_shared_memory - .insert(drop_token, memory.clone()); - } - } - Ok(Err(_)) => { - closed.push(receiver_id); - } - Err(_) => { - tracing::warn!( - "dropping input event `{receiver_id}/{input_id}` (send timeout)" - ); - } - } - } - } - for id in closed { - dataflow.subscribe_channels.remove(id); - } - - // TODO send `data` via network to all remove receivers - if let Some((memory, len)) = &data { - let data = std::ptr::slice_from_raw_parts(memory.as_ptr(), *len); - } - } DaemonNodeEvent::Stopped => { tracing::info!("Stopped: {dataflow_id}/{node_id}"); @@ -597,12 +511,95 @@ impl Daemon { } Ok(RunStatus::Continue) } -} -struct PreparedMessage { - output_id: DataId, - metadata: dora_message::Metadata<'static>, - data: Option<(Shmem, usize)>, + async fn handle_shmem_handler_event(&mut self, event: ShmemHandlerEvent) -> eyre::Result<()> { + match event 
{ + ShmemHandlerEvent::SendOut { + dataflow_id, + node_id, + output_id, + metadata, + data, + } => { + let dataflow = self.running.get_mut(&dataflow_id).wrap_err_with(|| { + format!("send out failed: no running dataflow with ID `{dataflow_id}`") + })?; + + tracing::trace!( + "Time between prepare and send out: {:?}", + metadata + .timestamp() + .get_time() + .to_system_time() + .elapsed() + .unwrap() + ); + + // figure out receivers from dataflow graph + let empty_set = BTreeSet::new(); + let local_receivers = dataflow + .mappings + .get(&(node_id, output_id)) + .unwrap_or(&empty_set); + + // send shared memory ID to all local receivers + let mut closed = Vec::new(); + let mut drop_tokens = Vec::new(); + for (receiver_id, input_id) in local_receivers { + if let Some(channel) = dataflow.subscribe_channels.get(receiver_id) { + let drop_token = DropToken::generate(); + let send_result = channel.send_async(daemon_messages::NodeEvent::Input { + id: input_id.clone(), + metadata: metadata.clone(), + data: data.as_ref().map(|data| daemon_messages::InputData { + shared_memory_id: data.get_os_id().to_owned(), + len: data.len(), + drop_token: drop_token.clone(), + }), + }); + + match timeout(Duration::from_millis(10), send_result).await { + Ok(Ok(())) => { + drop_tokens.push(drop_token); + } + Ok(Err(_)) => { + closed.push(receiver_id); + } + Err(_) => { + tracing::warn!( + "dropping input event `{receiver_id}/{input_id}` (send timeout)" + ); + } + } + } + } + for id in closed { + dataflow.subscribe_channels.remove(id); + } + let data_bytes = data.as_ref().map(|d| unsafe { d.as_slice() }.to_owned()); + + // report drop tokens to shared memory handler + if let Some(data) = data { + if let Err(err) = self + .shared_memory_handler + .send_async(shared_mem_handler::DaemonEvent::SentOut { data, drop_tokens }) + .await + .wrap_err("shared mem handler crashed after send out") + { + tracing::error!("{err:?}"); + } + } + + // TODO send `data` via network to all remove receivers + if 
let Some(data) = data_bytes {} + } + ShmemHandlerEvent::HandlerError(err) => { + bail!(err.wrap_err("shared memory handler failed")) + } + } + + Ok(()) + } } #[derive(Default)] @@ -629,7 +626,7 @@ pub enum Event { }, Coordinator(CoordinatorEvent), Dora(DoraEvent), - Drop(DropEvent), + ShmemHandler(ShmemHandlerEvent), WatchdogInterval, } @@ -638,23 +635,55 @@ impl From for Event { Event::Dora(event) } } +impl From for Event { + fn from(event: ShmemHandlerEvent) -> Self { + Event::ShmemHandler(event) + } +} #[derive(Debug)] pub enum DaemonNodeEvent { - PrepareOutputMessage { - output_id: DataId, - metadata: dora_message::Metadata<'static>, - data_len: usize, - }, - SendOutMessage { - id: MessageId, - }, Stopped, Subscribe { event_sender: flume::Sender, }, } +pub enum ShmemHandlerEvent { + SendOut { + dataflow_id: DataflowId, + node_id: NodeId, + output_id: DataId, + metadata: dora_message::Metadata<'static>, + data: Option, + }, + HandlerError(eyre::ErrReport), +} + +impl fmt::Debug for ShmemHandlerEvent { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::SendOut { + dataflow_id, + node_id, + output_id, + metadata, + data, + } => f + .debug_struct("SendOut") + .field("dataflow_id", dataflow_id) + .field("node_id", node_id) + .field("output_id", output_id) + .field("metadata", metadata) + .field("data", &data.as_ref().map(|_| "Some(..)").unwrap_or("None")) + .finish(), + ShmemHandlerEvent::HandlerError(err) => { + f.debug_tuple("HandlerError").field(err).finish() + } + } + } +} + #[derive(Debug)] pub enum DoraEvent { Timer { diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index c18bdd03..c1012c1c 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -1,133 +1,204 @@ -use crate::{DaemonNodeEvent, Event}; +use crate::{shared_mem_handler, DaemonNodeEvent, Event}; use dora_core::{ - daemon_messages::{DaemonReply, DaemonRequest, DropEvent}, + config::NodeId, + 
daemon_messages::{DaemonReply, DaemonRequest, DataflowId, DropEvent, NodeEvent}, shared_memory::ShmemServer, }; -use eyre::Context; +use eyre::{eyre, Context}; use tokio::sync::{mpsc, oneshot}; -#[tracing::instrument(skip(server, events_tx))] +#[tracing::instrument(skip(server, daemon_tx, shmem_handler_tx))] pub fn listener_loop( mut server: ShmemServer, - events_tx: mpsc::Sender, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, ) { - let mut id = None; - let mut events = None; - loop { - // receive the next message - let message = match server.listen().wrap_err("failed to receive DaemonRequest") { - Ok(Some(m)) => m, - Ok(None) => { - tracing::info!("channel disconnected: {id:?}"); - break; - } // disconnected - Err(err) => { + // receive the first message + let message = match server + .listen() + .wrap_err("failed to receive register message") + { + Ok(Some(m)) => m, + Ok(None) => { + tracing::info!("channel disconnected before register message"); + return; + } // disconnected + Err(err) => { + tracing::info!("{err:?}"); + return; + } + }; + + match message { + DaemonRequest::Register { + dataflow_id, + node_id, + } => { + let reply = DaemonReply::Result(Ok(())); + match server + .send_reply(&reply) + .wrap_err("failed to send register reply") + { + Ok(()) => { + let mut listener = Listener { + dataflow_id, + node_id, + server, + daemon_tx, + shmem_handler_tx, + subscribed_events: None, + }; + match listener.run().wrap_err("listener failed") { + Ok(()) => {} + Err(err) => tracing::error!("{err:?}"), + } + } + Err(err) => { + tracing::warn!("{err:?}"); + } + } + } + _ => { + let reply = DaemonReply::Result(Err("must send register message first".into())); + if let Err(err) = server.send_reply(&reply).wrap_err("failed to send reply") { tracing::warn!("{err:?}"); - continue; } - }; - - // handle the message and translate it to a NodeEvent - let node_event = match message { - DaemonRequest::Register { - dataflow_id, - node_id, - } => { - id = 
Some((dataflow_id, node_id)); + } + } +} - let reply = DaemonReply::Result(Ok(())); +struct Listener { + dataflow_id: DataflowId, + node_id: NodeId, + server: ShmemServer, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, + subscribed_events: Option>, +} - match server.send_reply(&reply) { - Ok(()) => continue, // don't trigger an event for register calls - Err(err) => { - tracing::warn!("{err:?}"); - break; // close connection - } +impl Listener { + fn run(&mut self) -> eyre::Result<()> { + loop { + // receive the next message + let message = match self + .server + .listen() + .wrap_err("failed to receive DaemonRequest") + { + Ok(Some(m)) => m, + Ok(None) => { + tracing::info!( + "channel disconnected: {}/{}", + self.dataflow_id, + self.node_id + ); + break; + } // disconnected + Err(err) => { + tracing::warn!("{err:?}"); + continue; } + }; + self.handle_message(message)?; + } + Ok(()) + } + + fn handle_message(&mut self, message: DaemonRequest) -> eyre::Result<()> { + match message { + DaemonRequest::Register { .. 
} => { + let reply = DaemonReply::Result(Err("unexpected register message".into())); + self.send_reply(&reply)?; } - DaemonRequest::Stopped => DaemonNodeEvent::Stopped, + DaemonRequest::Stopped => self.process_daemon_event(DaemonNodeEvent::Stopped)?, DaemonRequest::PrepareOutputMessage { output_id, metadata, data_len, - } => DaemonNodeEvent::PrepareOutputMessage { - output_id, - metadata, - data_len, - }, - DaemonRequest::SendOutMessage { id } => DaemonNodeEvent::SendOutMessage { id }, - DaemonRequest::Subscribe { - dataflow_id, - node_id, } => { + let (reply_sender, reply) = oneshot::channel(); + let event = shared_mem_handler::NodeEvent::PrepareOutputMessage { + dataflow_id: self.dataflow_id, + node_id: self.node_id.clone(), + output_id, + metadata, + data_len, + reply_sender, + }; + self.send_shared_memory_event(event)?; + self.send_reply( + &reply + .blocking_recv() + .wrap_err("failed to receive prepare output reply")?, + )?; + } + DaemonRequest::SendOutMessage { id } => { + let (reply_sender, reply) = oneshot::channel(); + let event = shared_mem_handler::NodeEvent::SendOutMessage { id, reply_sender }; + self.send_shared_memory_event(event)?; + self.send_reply( + &reply + .blocking_recv() + .wrap_err("failed to receive send output reply")?, + )?; + } + DaemonRequest::Subscribe => { let (tx, rx) = flume::bounded(10); - - id = Some((dataflow_id, node_id)); - events = Some(rx); - - DaemonNodeEvent::Subscribe { event_sender: tx } + self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx })?; + self.subscribed_events = Some(rx); } DaemonRequest::NextEvent { drop_tokens } => { - let drop_event = Event::Drop(DropEvent { + let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { tokens: drop_tokens, }); - if events_tx.blocking_send(drop_event).is_err() { - tracing::warn!( - "`events_tx` was closed unexpectedly when trying to send drop tokens" - ); - } + self.send_shared_memory_event(drop_event)?; - let reply = match events.as_mut() { + let 
reply = match self.subscribed_events.as_mut() { Some(events) => match events.recv() { Ok(event) => DaemonReply::NodeEvent(event), Err(flume::RecvError::Disconnected) => DaemonReply::Closed, }, None => { DaemonReply::Result(Err("Ignoring event request because no subscribe \ - message was sent yet" + message was sent yet" .into())) } }; - if let Err(err) = server.send_reply(&reply).wrap_err("failed to send reply") { - tracing::error!("{err:?}"); - break; - } - - continue; // don't trigger an event (apart from the drop event sent above) - } - }; - - let (dataflow_id, node_id) = match &id { - Some(id) => id.clone(), - None => { - tracing::warn!( - "Ignoring node event because no register \ - message was sent yet: {node_event:?}" - ); - continue; + self.send_reply(&reply)?; } - }; + } + Ok(()) + } + fn process_daemon_event(&mut self, event: DaemonNodeEvent) -> eyre::Result<()> { // send NodeEvent to daemon main loop let (reply_tx, reply) = oneshot::channel(); let event = Event::Node { - dataflow_id, - node_id, - event: node_event, + dataflow_id: self.dataflow_id.clone(), + node_id: self.node_id.clone(), + event, reply_sender: reply_tx, }; - let Ok(()) = events_tx.blocking_send(event) else { - break; - }; + self.daemon_tx + .blocking_send(event) + .map_err(|_| eyre!("failed to send event to daemon"))?; + let reply = reply + .blocking_recv() + .map_err(|_| eyre!("failed to receive reply from daemon"))?; + self.send_reply(&reply)?; + Ok(()) + } - // wait for reply and send it out - let Ok(reply) = reply.blocking_recv() else { - break; // main loop exited - }; - if let Err(err) = server.send_reply(&reply).wrap_err("failed to send reply") { - tracing::error!("{err:?}"); - break; - } + fn send_reply(&mut self, reply: &DaemonReply) -> eyre::Result<()> { + self.server + .send_reply(&reply) + .wrap_err("failed to send reply to node") + } + + fn send_shared_memory_event(&self, event: shared_mem_handler::NodeEvent) -> eyre::Result<()> { + self.shmem_handler_tx + .send(event) + 
.map_err(|_| eyre!("failed to send event to shared_mem_handler")) } } diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs new file mode 100644 index 00000000..2a2e1bfb --- /dev/null +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -0,0 +1,270 @@ +use core::fmt; +use std::{collections::HashMap, sync::Arc, time::Instant}; + +use dora_core::{ + config::{DataId, NodeId}, + daemon_messages::{DaemonReply, DataflowId, DropEvent, DropToken}, +}; +use eyre::{eyre, Context}; +use flume::{Receiver, Sender}; +use futures::StreamExt; +use futures_concurrency::stream::Merge; +use shared_memory::{Shmem, ShmemConf}; +use tokio::sync::oneshot; +use uuid::Uuid; + +use crate::MessageId; + +pub struct SharedMemHandler { + events_tx: Sender, + prepared_messages: HashMap, + sent_out_shared_memory: HashMap>, +} + +impl SharedMemHandler { + pub fn new(events_tx: Sender) -> Self { + Self { + events_tx, + prepared_messages: HashMap::new(), + sent_out_shared_memory: HashMap::new(), + } + } + + pub async fn run( + &mut self, + node_events: Receiver, + daemon_events: Receiver, + ) { + if let Err(err) = self.run_inner(node_events, daemon_events).await { + if let Err(send_err) = self + .events_tx + .send_async(crate::ShmemHandlerEvent::HandlerError(err)) + .await + { + tracing::error!("{send_err:?}"); + } + } + } + + pub async fn run_inner( + &mut self, + node_events: Receiver, + daemon_events: Receiver, + ) -> eyre::Result<()> { + let mut events = ( + node_events.stream().map(Event::Node), + daemon_events.stream().map(Event::Daemon), + ) + .merge(); + while let Some(event) = events.next().await { + let start = Instant::now(); + let event_debug = format!("{event:?}"); + match event { + Event::Node(event) => self.handle_node_event(event).await?, + Event::Daemon(event) => self.handle_daemon_event(event).await?, + } + let elapsed = start.elapsed(); + // if elapsed.as_micros() > 10 { + // tracing::debug!("handled event in {elapsed:?}: {event_debug}"); 
+ // } + } + Ok(()) + } + + async fn handle_node_event(&mut self, event: NodeEvent) -> eyre::Result<()> { + match event { + NodeEvent::Drop(DropEvent { tokens }) => { + for token in tokens { + match self.sent_out_shared_memory.remove(&token) { + Some(arc) => { + if let Ok(shmem) = Arc::try_unwrap(arc) { + tokio::task::spawn_blocking(move || { + tracing::trace!( + "freeing shared memory after receiving last drop token" + ); + std::mem::drop(shmem); + }); + } + } + None => tracing::warn!("received unknown drop token {token:?}"), + } + } + } + NodeEvent::PrepareOutputMessage { + dataflow_id, + node_id, + output_id, + metadata, + data_len, + reply_sender, + } => { + tracing::trace!( + "Time between construct and prepare: {:?}", + metadata + .timestamp() + .get_time() + .to_system_time() + .elapsed() + .unwrap() + ); + + let memory = if data_len > 0 { + Some(ShmemHandle( + ShmemConf::new() + .size(data_len) + .create() + .wrap_err("failed to allocate shared memory")?, + )) + } else { + None + }; + let id = memory + .as_ref() + .map(|m| m.0.get_os_id().to_owned()) + .unwrap_or_else(|| Uuid::new_v4().to_string()); + let message = PreparedMessage { + dataflow_id, + node_id, + output_id, + metadata, + data: memory.map(|m| (m, data_len)), + }; + self.prepared_messages.insert(id.clone(), message); + + let reply = DaemonReply::PreparedMessage { + shared_memory_id: id.clone(), + }; + if reply_sender.send(reply).is_err() { + // free shared memory slice again + self.prepared_messages.remove(&id); + } + } + NodeEvent::SendOutMessage { id, reply_sender } => { + let message = self + .prepared_messages + .remove(&id) + .ok_or_else(|| eyre!("invalid shared memory id"))?; + let PreparedMessage { + dataflow_id, + node_id, + output_id, + metadata, + data, + } = message; + let data = data.map(|(m, len)| SharedMemSample { + shared_memory: m, + len, + }); + + let send_result = self + .events_tx + .send_async(crate::ShmemHandlerEvent::SendOut { + dataflow_id, + node_id, + output_id, + 
metadata, + data, + }) + .await; + let _ = reply_sender.send(DaemonReply::Result( + send_result.map_err(|_| "daemon is no longer running".into()), + )); + } + } + Ok(()) + } + + async fn handle_daemon_event(&mut self, event: DaemonEvent) -> eyre::Result<()> { + match event { + DaemonEvent::SentOut { data, drop_tokens } => { + // keep shared memory alive until we received all drop tokens + let memory = Arc::new(data.shared_memory); + for drop_token in drop_tokens { + self.sent_out_shared_memory + .insert(drop_token, memory.clone()); + } + } + } + Ok(()) + } +} + +pub struct SharedMemSample { + shared_memory: ShmemHandle, + len: usize, +} + +impl SharedMemSample { + pub fn as_raw_slice(&self) -> *const [u8] { + std::ptr::slice_from_raw_parts(self.shared_memory.0.as_ptr(), self.len) + } + + pub unsafe fn as_slice(&self) -> &[u8] { + unsafe { &*self.as_raw_slice() } + } + + pub fn get_os_id(&self) -> &str { + self.shared_memory.0.get_os_id() + } + + pub fn len(&self) -> usize { + self.shared_memory.0.len() + } +} + +#[derive(Debug)] +enum Event { + Node(NodeEvent), + Daemon(DaemonEvent), +} + +#[derive(Debug)] +pub enum NodeEvent { + PrepareOutputMessage { + dataflow_id: DataflowId, + node_id: NodeId, + output_id: DataId, + metadata: dora_message::Metadata<'static>, + data_len: usize, + reply_sender: oneshot::Sender, + }, + SendOutMessage { + id: MessageId, + reply_sender: oneshot::Sender, + }, + Drop(DropEvent), +} + +pub enum DaemonEvent { + SentOut { + data: SharedMemSample, + drop_tokens: Vec, + }, +} +impl fmt::Debug for DaemonEvent { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::SentOut { + data: _, + drop_tokens, + } => f + .debug_struct("SentOut") + .field("data", &"[..]") + .field("drop_tokens", drop_tokens) + .finish(), + } + } +} + +struct PreparedMessage { + dataflow_id: DataflowId, + node_id: NodeId, + output_id: DataId, + metadata: dora_message::Metadata<'static>, + data: Option<(ShmemHandle, usize)>, +} + +struct 
ShmemHandle(Shmem); + +unsafe impl Send for ShmemHandle {} +unsafe impl Sync for ShmemHandle {} diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 38b181ae..47d929bd 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,4 +1,4 @@ -use crate::{listener::listener_loop, DoraEvent, Event}; +use crate::{listener::listener_loop, shared_mem_handler, DoraEvent, Event}; use dora_core::{ daemon_messages::{DataflowId, NodeConfig, SpawnNodeParams}, descriptor::{resolve_path, source_is_url}, @@ -14,7 +14,8 @@ use tokio::sync::mpsc; pub async fn spawn_node( dataflow_id: DataflowId, params: SpawnNodeParams, - events_tx: mpsc::Sender, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, ) -> eyre::Result<()> { let SpawnNodeParams { node_id, @@ -57,16 +58,18 @@ pub async fn spawn_node( { let server = unsafe { ShmemServer::new(daemon_control_region) } .wrap_err("failed to create control server")?; - let events_tx = events_tx.clone(); - tokio::task::spawn_blocking(move || listener_loop(server, events_tx)); + let daemon_tx = daemon_tx.clone(); + let shmem_handler_tx = shmem_handler_tx.clone(); + tokio::task::spawn_blocking(move || listener_loop(server, daemon_tx, shmem_handler_tx)); } { let server = unsafe { ShmemServer::new(daemon_events_region) } .wrap_err("failed to create events server")?; let event_loop_node_id = format!("{dataflow_id}/{node_id}"); - let events_tx = events_tx.clone(); + let daemon_tx = daemon_tx.clone(); + let shmem_handler_tx = shmem_handler_tx.clone(); tokio::task::spawn_blocking(move || { - listener_loop(server, events_tx); + listener_loop(server, daemon_tx, shmem_handler_tx); tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); }); } @@ -115,7 +118,7 @@ pub async fn spawn_node( node_id: node_id_cloned, result, }; - let _ = events_tx.send(event.into()).await; + let _ = daemon_tx.send(event.into()).await; }); Ok(()) } From a24d7828e936be7006ae4ce999ba3e8a69d05466 Mon Sep 17 
00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jan 2023 18:27:49 +0100 Subject: [PATCH 101/225] Fix: The drop tokens might reach the `SharedMemHandler` before the `SentOut` message --- binaries/daemon/src/shared_mem_handler.rs | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index 2a2e1bfb..d584a546 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -1,5 +1,9 @@ use core::fmt; -use std::{collections::HashMap, sync::Arc, time::Instant}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, + time::Instant, +}; use dora_core::{ config::{DataId, NodeId}, @@ -19,6 +23,7 @@ pub struct SharedMemHandler { events_tx: Sender, prepared_messages: HashMap, sent_out_shared_memory: HashMap>, + dropped: HashSet, } impl SharedMemHandler { @@ -27,6 +32,7 @@ impl SharedMemHandler { events_tx, prepared_messages: HashMap::new(), sent_out_shared_memory: HashMap::new(), + dropped: HashSet::new(), } } @@ -86,7 +92,9 @@ impl SharedMemHandler { }); } } - None => tracing::warn!("received unknown drop token {token:?}"), + None => { + self.dropped.insert(token); + } } } } @@ -180,8 +188,12 @@ impl SharedMemHandler { // keep shared memory alive until we received all drop tokens let memory = Arc::new(data.shared_memory); for drop_token in drop_tokens { - self.sent_out_shared_memory - .insert(drop_token, memory.clone()); + if self.dropped.remove(&drop_token) { + // this token was already dropped -> ignore + } else { + self.sent_out_shared_memory + .insert(drop_token, memory.clone()); + } } } } From 0e95b0868a8fd4f74d77e3ddcf0b1d37718f2d8a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jan 2023 18:46:17 +0100 Subject: [PATCH 102/225] Fix: Report correct length in `SharedMemSample` --- binaries/daemon/src/shared_mem_handler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index d584a546..973fe4b1 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -220,7 +220,7 @@ impl SharedMemSample { } pub fn len(&self) -> usize { - self.shared_memory.0.len() + self.len } } From cf95078ca12f39dcfb00beea78ca8d450c8eebe9 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 20 Jan 2023 14:01:52 +0100 Subject: [PATCH 103/225] Extract shared memory server/client into separate crate To allow for individual testing and benchmarking. This will also enable reuse of the library in other projects. --- Cargo.lock | 48 +++++++++++-------- Cargo.toml | 1 + apis/rust/node/Cargo.toml | 3 +- apis/rust/node/src/daemon.rs | 9 ++-- apis/rust/node/src/lib.rs | 2 +- binaries/daemon/Cargo.toml | 2 +- binaries/daemon/src/listener.rs | 2 +- binaries/daemon/src/shared_mem_handler.rs | 2 +- binaries/daemon/src/spawn.rs | 3 +- libraries/core/Cargo.toml | 3 -- libraries/core/src/lib.rs | 1 - libraries/shared-memory-server/Cargo.toml | 15 ++++++ .../src}/channel.rs | 9 +++- .../src/lib.rs} | 10 ++-- 14 files changed, 68 insertions(+), 42 deletions(-) create mode 100644 libraries/shared-memory-server/Cargo.toml rename libraries/{core/src/shared_memory => shared-memory-server/src}/channel.rs (96%) rename libraries/{core/src/shared_memory/mod.rs => shared-memory-server/src/lib.rs} (88%) diff --git a/Cargo.lock b/Cargo.lock index 7d0ebed2..0a05d4eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -983,14 +983,11 @@ dependencies = [ name = "dora-core" version = "0.1.2" dependencies = [ - "bincode", "dora-message", "eyre", "once_cell", - "raw_sync", "serde", "serde_yaml 0.9.11", - "shared_memory", "tracing", "uuid 1.2.1", "which", @@ -1012,7 +1009,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml 0.8.23", - "shared_memory", + "shared-memory-server", "tokio", "tokio-stream", "tracing", @@ -1077,11 +1074,10 @@ dependencies = [ "eyre", 
"flume", "once_cell", - "raw_sync", "serde", "serde_json", "serde_yaml 0.8.23", - "shared_memory", + "shared-memory-server", "thiserror", "tokio", "tracing", @@ -2871,9 +2867,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.43" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" +checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" dependencies = [ "unicode-ident", ] @@ -3482,18 +3478,18 @@ checksum = "d65bd28f48be7196d222d95b9243287f48d27aca604e08497513019ff0502cc4" [[package]] name = "serde" -version = "1.0.144" +version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860" +checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.144" +version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94ed3a816fb1d101812f83e789f888322c34e291f894f19590dc310963e87a00" +checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" dependencies = [ "proc-macro2", "quote", @@ -3581,6 +3577,18 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shared-memory-server" +version = "0.1.2" +dependencies = [ + "bincode", + "eyre", + "raw_sync", + "serde", + "shared_memory", + "tracing", +] + [[package]] name = "shared_memory" version = "0.12.0" @@ -3711,9 +3719,9 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "syn" -version = "1.0.99" +version = "1.0.107" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13" +checksum = 
"1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" dependencies = [ "proc-macro2", "quote", @@ -4049,9 +4057,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" [[package]] name = "tracing" -version = "0.1.36" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if 1.0.0", "log", @@ -4062,9 +4070,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2" +checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", @@ -4073,9 +4081,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.29" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" +checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" dependencies = [ "once_cell", "valuable", diff --git a/Cargo.toml b/Cargo.toml index 595fdc34..0aef9722 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ members = [ "libraries/communication-layer/*", "libraries/core", "libraries/message", + "libraries/shared-memory-server", "libraries/extensions/download", "libraries/extensions/telemetry/*", "libraries/extensions/zenoh-logger", diff --git a/apis/rust/node/Cargo.toml b/apis/rust/node/Cargo.toml index d685f34d..e62185f4 100644 --- a/apis/rust/node/Cargo.toml +++ b/apis/rust/node/Cargo.toml @@ -22,8 +22,7 @@ uuid = { version = "1.1.2", features = ["v4"] } capnp = "0.14.11" dora-message = { path = "../../../libraries/message" } dora-core = { path = "../../../libraries/core" 
} -shared_memory = "0.12.0" -raw_sync = "0.1.5" +shared-memory-server = { path = "../../../libraries/shared-memory-server" } [dev-dependencies] tokio = { version = "1.17.0", features = ["rt"] } diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 424c3ac4..4f6a7a3c 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -1,12 +1,15 @@ use dora_core::{ config::{DataId, NodeId}, daemon_messages::{DaemonReply, DaemonRequest, DataflowId, NodeEvent}, - shared_memory::ShmemClient, }; use dora_message::Metadata; use eyre::{bail, eyre, Context}; -use shared_memory::{Shmem, ShmemConf}; -use std::{marker::PhantomData, thread::JoinHandle, time::Duration}; +use shared_memory_server::{Shmem, ShmemClient, ShmemConf}; +use std::{ + marker::PhantomData, + thread::JoinHandle, + time::{Duration, Instant}, +}; pub struct DaemonConnection { pub control_channel: ControlChannel, diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 3ce43f6e..8f7087db 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -9,7 +9,7 @@ use dora_core::{ pub use dora_message::{uhlc, Metadata, MetadataParameters}; use eyre::WrapErr; pub use flume::Receiver; -use shared_memory::ShmemConf; +use shared_memory_server::ShmemConf; pub mod daemon; diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index e6f75685..6be78b6a 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -14,7 +14,6 @@ tracing-subscriber = "0.3.15" futures-concurrency = "7.0.0" serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.86" -shared_memory = "0.12.0" dora-core = { path = "../../libraries/core" } dora-message = { path = "../../libraries/message" } flume = "0.10.14" @@ -23,3 +22,4 @@ serde_yaml = "0.8.23" uuid = { version = "1.1.2", features = ["v4"] } futures = "0.3.25" clap = { version = "3.1.8", features = ["derive"] } +shared-memory-server = { path = "../../libraries/shared-memory-server" 
} diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index c1012c1c..5f4fd949 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -2,9 +2,9 @@ use crate::{shared_mem_handler, DaemonNodeEvent, Event}; use dora_core::{ config::NodeId, daemon_messages::{DaemonReply, DaemonRequest, DataflowId, DropEvent, NodeEvent}, - shared_memory::ShmemServer, }; use eyre::{eyre, Context}; +use shared_memory_server::ShmemServer; use tokio::sync::{mpsc, oneshot}; #[tracing::instrument(skip(server, daemon_tx, shmem_handler_tx))] diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index 973fe4b1..20cfa9cf 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -13,7 +13,7 @@ use eyre::{eyre, Context}; use flume::{Receiver, Sender}; use futures::StreamExt; use futures_concurrency::stream::Merge; -use shared_memory::{Shmem, ShmemConf}; +use shared_memory_server::{Shmem, ShmemConf}; use tokio::sync::oneshot; use uuid::Uuid; diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 47d929bd..bda1f97e 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -2,11 +2,10 @@ use crate::{listener::listener_loop, shared_mem_handler, DoraEvent, Event}; use dora_core::{ daemon_messages::{DataflowId, NodeConfig, SpawnNodeParams}, descriptor::{resolve_path, source_is_url}, - shared_memory::ShmemServer, }; use dora_download::download_file; use eyre::{eyre, WrapErr}; -use shared_memory::ShmemConf; +use shared_memory_server::{ShmemConf, ShmemServer}; use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; use tokio::sync::mpsc; diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml index 6c7b347e..b4651fec 100644 --- a/libraries/core/Cargo.toml +++ b/libraries/core/Cargo.toml @@ -15,7 +15,4 @@ zenoh-config = { git = "https://github.com/eclipse-zenoh/zenoh.git", rev = "79a1 which = 
"4.3.0" uuid = { version = "1.2.1", features = ["serde"] } dora-message = { path = "../message" } -shared_memory = "0.12.0" -bincode = "1.3.3" -raw_sync = "0.1.5" tracing = "0.1" diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index 52e82393..a96517dc 100644 --- a/libraries/core/src/lib.rs +++ b/libraries/core/src/lib.rs @@ -8,7 +8,6 @@ pub mod config; pub mod coordinator_messages; pub mod daemon_messages; pub mod descriptor; -pub mod shared_memory; pub mod topics; pub fn adjust_shared_library_path(path: &Path) -> Result { diff --git a/libraries/shared-memory-server/Cargo.toml b/libraries/shared-memory-server/Cargo.toml new file mode 100644 index 00000000..3fefc942 --- /dev/null +++ b/libraries/shared-memory-server/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "shared-memory-server" +version.workspace = true +edition = "2021" +license = "Apache-2.0" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +eyre = "0.6.8" +serde = { version = "1.0.152", features = ["derive"] } +shared_memory = "0.12.0" +raw_sync = "0.1.5" +bincode = "1.3.3" +tracing = "0.1.37" diff --git a/libraries/core/src/shared_memory/channel.rs b/libraries/shared-memory-server/src/channel.rs similarity index 96% rename from libraries/core/src/shared_memory/channel.rs rename to libraries/shared-memory-server/src/channel.rs index 10643122..6f911b8e 100644 --- a/libraries/core/src/shared_memory/channel.rs +++ b/libraries/shared-memory-server/src/channel.rs @@ -5,7 +5,7 @@ use shared_memory::Shmem; use std::{ mem, slice, sync::atomic::{AtomicBool, AtomicU64}, - time::Duration, + time::{Duration, Instant}, }; pub struct ShmemChannel { @@ -81,12 +81,17 @@ impl ShmemChannel { }) } - pub fn send(&mut self, value: &T) -> eyre::Result<()> + pub fn send(&mut self, value: &T, start: Instant) -> eyre::Result<()> where T: Serialize + std::fmt::Debug, { let msg = bincode::serialize(value).wrap_err("failed to serialize value")?; + 
let elapsed = start.elapsed(); + if elapsed.as_micros() > 1 { + tracing::debug!("before send: {elapsed:?}"); + } + self.send_raw(&msg) } diff --git a/libraries/core/src/shared_memory/mod.rs b/libraries/shared-memory-server/src/lib.rs similarity index 88% rename from libraries/core/src/shared_memory/mod.rs rename to libraries/shared-memory-server/src/lib.rs index d230f535..0e482c8b 100644 --- a/libraries/core/src/shared_memory/mod.rs +++ b/libraries/shared-memory-server/src/lib.rs @@ -1,9 +1,9 @@ use self::channel::ShmemChannel; use eyre::{eyre, Context}; use serde::{Deserialize, Serialize}; -use shared_memory::Shmem; +pub use shared_memory::{Shmem, ShmemConf}; use std::marker::PhantomData; -use std::time::Duration; +use std::time::{Duration, Instant}; mod channel; @@ -40,7 +40,7 @@ impl ShmemServer { U: Serialize + std::fmt::Debug, { assert!(self.reply_expected); - self.channel.send(value)?; + self.channel.send(value, Instant::now())?; self.reply_expected = false; Ok(()) } @@ -61,13 +61,13 @@ impl ShmemClient { }) } - pub fn request(&mut self, value: &T) -> eyre::Result + pub fn request(&mut self, value: &T, start: Instant) -> eyre::Result where T: Serialize + std::fmt::Debug, U: for<'a> Deserialize<'a> + std::fmt::Debug, { self.channel - .send(value) + .send(value, start) .wrap_err("failed to send request")?; self.channel .receive(self.timeout) From c1346894b31e91de8feadbc3cca4a2315576cac8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jan 2023 11:12:41 +0100 Subject: [PATCH 104/225] Remove logging of elapsed time --- libraries/shared-memory-server/src/channel.rs | 9 ++------- libraries/shared-memory-server/src/lib.rs | 8 ++++---- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/libraries/shared-memory-server/src/channel.rs b/libraries/shared-memory-server/src/channel.rs index 6f911b8e..10643122 100644 --- a/libraries/shared-memory-server/src/channel.rs +++ b/libraries/shared-memory-server/src/channel.rs @@ -5,7 +5,7 @@ use 
shared_memory::Shmem; use std::{ mem, slice, sync::atomic::{AtomicBool, AtomicU64}, - time::{Duration, Instant}, + time::Duration, }; pub struct ShmemChannel { @@ -81,17 +81,12 @@ impl ShmemChannel { }) } - pub fn send(&mut self, value: &T, start: Instant) -> eyre::Result<()> + pub fn send(&mut self, value: &T) -> eyre::Result<()> where T: Serialize + std::fmt::Debug, { let msg = bincode::serialize(value).wrap_err("failed to serialize value")?; - let elapsed = start.elapsed(); - if elapsed.as_micros() > 1 { - tracing::debug!("before send: {elapsed:?}"); - } - self.send_raw(&msg) } diff --git a/libraries/shared-memory-server/src/lib.rs b/libraries/shared-memory-server/src/lib.rs index 0e482c8b..e55dcb17 100644 --- a/libraries/shared-memory-server/src/lib.rs +++ b/libraries/shared-memory-server/src/lib.rs @@ -3,7 +3,7 @@ use eyre::{eyre, Context}; use serde::{Deserialize, Serialize}; pub use shared_memory::{Shmem, ShmemConf}; use std::marker::PhantomData; -use std::time::{Duration, Instant}; +use std::time::Duration; mod channel; @@ -40,7 +40,7 @@ impl ShmemServer { U: Serialize + std::fmt::Debug, { assert!(self.reply_expected); - self.channel.send(value, Instant::now())?; + self.channel.send(value)?; self.reply_expected = false; Ok(()) } @@ -61,13 +61,13 @@ impl ShmemClient { }) } - pub fn request(&mut self, value: &T, start: Instant) -> eyre::Result + pub fn request(&mut self, value: &T) -> eyre::Result where T: Serialize + std::fmt::Debug, U: for<'a> Deserialize<'a> + std::fmt::Debug, { self.channel - .send(value, start) + .send(value) .wrap_err("failed to send request")?; self.channel .receive(self.timeout) From 8383c99823a04a1d97abcbcb32c52c299aa18ba2 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jan 2023 11:14:01 +0100 Subject: [PATCH 105/225] Benchmark latency of ShmemServer/ShmemClient --- .../shared-memory-server/src/bin/bench.rs | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 
libraries/shared-memory-server/src/bin/bench.rs diff --git a/libraries/shared-memory-server/src/bin/bench.rs b/libraries/shared-memory-server/src/bin/bench.rs new file mode 100644 index 00000000..1392b7e6 --- /dev/null +++ b/libraries/shared-memory-server/src/bin/bench.rs @@ -0,0 +1,109 @@ +use std::{ + process::Command, + time::{Duration, Instant}, +}; + +use eyre::{eyre, Context, ContextCompat}; +use shared_memory_server::{ShmemClient, ShmemConf, ShmemServer}; + +fn main() -> eyre::Result<()> { + let mut args = std::env::args(); + let executable = args.next().wrap_err("no arg 0")?; + let arg = args.next(); + + match arg.as_deref() { + Some("client") => client(args.next().wrap_err("no shmem id")?)?, + None => server(executable)?, + Some(other) => eyre::bail!("unexpected argument `{other}`"), + } + + Ok(()) +} + +fn server(executable: String) -> eyre::Result<()> { + let shmem = ShmemConf::new() + .size(4096) + .create() + .wrap_err("failed to create shmem region")?; + let shmem_id = shmem.get_os_id().to_owned(); + let mut server = unsafe { ShmemServer::new(shmem) }.wrap_err("failed to create ShmemServer")?; + + let mut client = Command::new(executable); + client.arg("client").arg(shmem_id); + let mut client_handle = client.spawn().wrap_err("failed to spawn client process")?; + + server_loop(&mut server).wrap_err("server loop failed")?; + + let status = client_handle + .wait() + .wrap_err("failed to wait for client process")?; + + if status.success() { + Ok(()) + } else { + Err(eyre!("client failed")) + } +} + +#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] +enum Request { + Ping, +} + +#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] +enum Reply { + Pong, +} + +fn server_loop(server: &mut ShmemServer) -> eyre::Result<()> { + while let Some(request) = server.listen().wrap_err("failed to receive next message")? 
{ + match request { + Request::Ping => server + .send_reply(&Reply::Pong) + .wrap_err("failed to send reply")?, + } + } + Ok(()) +} + +fn client(shmem_id: String) -> eyre::Result<()> { + let shmem = ShmemConf::new() + .os_id(shmem_id) + .open() + .wrap_err("failed to open shmem region")?; + let mut client = unsafe { ShmemClient::new(shmem, Some(Duration::from_secs(2))) } + .wrap_err("failed to create ShmemClient")?; + + client_loop(&mut client).wrap_err("client loop failed")?; + + Ok(()) +} + +fn client_loop(client: &mut ShmemClient) -> eyre::Result<()> { + let mut latencies = Vec::new(); + for _ in 0..10_000_000 { + let start = Instant::now(); + let reply = client.request(&Request::Ping).wrap_err("ping failed")?; + match reply { + Reply::Pong => { + latencies.push(start.elapsed()); + } + } + } + + let n = latencies.len(); + let avg_latency = latencies.iter().copied().sum::() / n as u32; + let min_latency = latencies.iter().min().unwrap(); + let max_latency = latencies.iter().max().unwrap(); + println!("average latency: {avg_latency:?} (min: {min_latency:?}, max: {max_latency:?})"); + + let mut longest: Vec<_> = latencies.iter().enumerate().map(|(i, d)| (d, i)).collect(); + longest.sort_unstable_by(|a, b| b.cmp(a)); + + println!("\nlongest iterations:"); + for (duration, index) in &longest[..10] { + println!(" {index}: {duration:?}") + } + + Ok(()) +} From accfae6d97af1b88682f439db928184cb75ae339 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jan 2023 12:10:55 +0100 Subject: [PATCH 106/225] Use flume channels in more places spawn more actors as separate threads --- binaries/daemon/src/lib.rs | 35 +++++------- binaries/daemon/src/listener.rs | 14 ++--- binaries/daemon/src/shared_mem_handler.rs | 67 ++++++++++------------- binaries/daemon/src/spawn.rs | 11 ++-- 4 files changed, 56 insertions(+), 71 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index e0fb514c..847d585d 100644 --- a/binaries/daemon/src/lib.rs +++ 
b/binaries/daemon/src/lib.rs @@ -21,12 +21,8 @@ use std::{ time::{Duration, Instant}, }; use tcp_utils::tcp_receive; -use tokio::{ - fs, - sync::{mpsc, oneshot}, - time::timeout, -}; -use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt}; +use tokio::{fs, sync::oneshot, time::timeout}; +use tokio_stream::{Stream, StreamExt}; use uuid::Uuid; mod coordinator; @@ -38,10 +34,9 @@ mod tcp_utils; pub struct Daemon { running: HashMap, - events_tx: mpsc::Sender, + events_tx: flume::Sender, - shared_memory_handler: flume::Sender, - shared_memory_handler_node: flume::Sender, + shared_memory_handler: flume::Sender, coordinator_addr: Option, machine_id: String, @@ -129,26 +124,22 @@ impl Daemon { machine_id: String, exit_when_done: Option>, ) -> eyre::Result<()> { - let (dora_events_tx, dora_events_rx) = mpsc::channel(5); - let (shared_memory_handler, shared_memory_daemon_rx) = flume::unbounded(); - let (shared_memory_handler_node, shared_memory_node_rx) = flume::bounded(10); + let (dora_events_tx, dora_events_rx) = flume::bounded(50); + let (shared_memory_handler, shared_memory_rx) = flume::unbounded(); let daemon = Self { running: HashMap::new(), events_tx: dora_events_tx, shared_memory_handler, - shared_memory_handler_node, coordinator_addr, machine_id, exit_when_done, }; let (shmem_events_tx, shmem_events_rx) = flume::bounded(5); - tokio::spawn(async { + std::thread::spawn(|| { let mut handler = shared_mem_handler::SharedMemHandler::new(shmem_events_tx); - handler - .run(shared_memory_node_rx, shared_memory_daemon_rx) - .await; + handler.run(shared_memory_rx); }); - let dora_events = ReceiverStream::new(dora_events_rx); + let dora_events = dora_events_rx.into_stream(); let shmem_events = shmem_events_rx.into_stream().map(Event::ShmemHandler); let watchdog_interval = tokio_stream::wrappers::IntervalStream::new(tokio::time::interval( Duration::from_secs(5), @@ -307,7 +298,7 @@ impl Daemon { dataflow_id, params, self.events_tx.clone(), - 
self.shared_memory_handler_node.clone(), + self.shared_memory_handler.clone(), ) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; @@ -328,7 +319,7 @@ impl Daemon { Default::default(), ), }; - if events_tx.send(event.into()).await.is_err() { + if events_tx.send_async(event.into()).await.is_err() { break; } } @@ -582,7 +573,9 @@ impl Daemon { if let Some(data) = data { if let Err(err) = self .shared_memory_handler - .send_async(shared_mem_handler::DaemonEvent::SentOut { data, drop_tokens }) + .send_async( + shared_mem_handler::DaemonEvent::SentOut { data, drop_tokens }.into(), + ) .await .wrap_err("shared mem handler crashed after send out") { diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 5f4fd949..0e9b9882 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -5,13 +5,13 @@ use dora_core::{ }; use eyre::{eyre, Context}; use shared_memory_server::ShmemServer; -use tokio::sync::{mpsc, oneshot}; +use tokio::sync::oneshot; #[tracing::instrument(skip(server, daemon_tx, shmem_handler_tx))] pub fn listener_loop( mut server: ShmemServer, - daemon_tx: mpsc::Sender, - shmem_handler_tx: flume::Sender, + daemon_tx: flume::Sender, + shmem_handler_tx: flume::Sender, ) { // receive the first message let message = match server @@ -71,8 +71,8 @@ struct Listener { dataflow_id: DataflowId, node_id: NodeId, server: ShmemServer, - daemon_tx: mpsc::Sender, - shmem_handler_tx: flume::Sender, + daemon_tx: flume::Sender, + shmem_handler_tx: flume::Sender, subscribed_events: Option>, } @@ -181,7 +181,7 @@ impl Listener { reply_sender: reply_tx, }; self.daemon_tx - .blocking_send(event) + .send(event) .map_err(|_| eyre!("failed to send event to daemon"))?; let reply = reply .blocking_recv() @@ -198,7 +198,7 @@ impl Listener { fn send_shared_memory_event(&self, event: shared_mem_handler::NodeEvent) -> eyre::Result<()> { self.shmem_handler_tx - .send(event) + .send(event.into()) .map_err(|_| 
eyre!("failed to send event to shared_mem_handler")) } } diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index 20cfa9cf..096cedd1 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -11,8 +11,6 @@ use dora_core::{ }; use eyre::{eyre, Context}; use flume::{Receiver, Sender}; -use futures::StreamExt; -use futures_concurrency::stream::Merge; use shared_memory_server::{Shmem, ShmemConf}; use tokio::sync::oneshot; use uuid::Uuid; @@ -36,38 +34,24 @@ impl SharedMemHandler { } } - pub async fn run( - &mut self, - node_events: Receiver, - daemon_events: Receiver, - ) { - if let Err(err) = self.run_inner(node_events, daemon_events).await { + pub fn run(&mut self, events: Receiver) { + if let Err(err) = self.run_inner(events) { if let Err(send_err) = self .events_tx - .send_async(crate::ShmemHandlerEvent::HandlerError(err)) - .await + .send(crate::ShmemHandlerEvent::HandlerError(err)) { tracing::error!("{send_err:?}"); } } } - pub async fn run_inner( - &mut self, - node_events: Receiver, - daemon_events: Receiver, - ) -> eyre::Result<()> { - let mut events = ( - node_events.stream().map(Event::Node), - daemon_events.stream().map(Event::Daemon), - ) - .merge(); - while let Some(event) = events.next().await { + pub fn run_inner(&mut self, events: Receiver) -> eyre::Result<()> { + while let Ok(event) = events.recv() { let start = Instant::now(); let event_debug = format!("{event:?}"); match event { - Event::Node(event) => self.handle_node_event(event).await?, - Event::Daemon(event) => self.handle_daemon_event(event).await?, + Event::Node(event) => self.handle_node_event(event)?, + Event::Daemon(event) => self.handle_daemon_event(event)?, } let elapsed = start.elapsed(); // if elapsed.as_micros() > 10 { @@ -77,14 +61,14 @@ impl SharedMemHandler { Ok(()) } - async fn handle_node_event(&mut self, event: NodeEvent) -> eyre::Result<()> { + fn handle_node_event(&mut self, event: 
NodeEvent) -> eyre::Result<()> { match event { NodeEvent::Drop(DropEvent { tokens }) => { for token in tokens { match self.sent_out_shared_memory.remove(&token) { Some(arc) => { if let Ok(shmem) = Arc::try_unwrap(arc) { - tokio::task::spawn_blocking(move || { + std::thread::spawn(move || { tracing::trace!( "freeing shared memory after receiving last drop token" ); @@ -164,16 +148,13 @@ impl SharedMemHandler { len, }); - let send_result = self - .events_tx - .send_async(crate::ShmemHandlerEvent::SendOut { - dataflow_id, - node_id, - output_id, - metadata, - data, - }) - .await; + let send_result = self.events_tx.send(crate::ShmemHandlerEvent::SendOut { + dataflow_id, + node_id, + output_id, + metadata, + data, + }); let _ = reply_sender.send(DaemonReply::Result( send_result.map_err(|_| "daemon is no longer running".into()), )); @@ -182,7 +163,7 @@ impl SharedMemHandler { Ok(()) } - async fn handle_daemon_event(&mut self, event: DaemonEvent) -> eyre::Result<()> { + fn handle_daemon_event(&mut self, event: DaemonEvent) -> eyre::Result<()> { match event { DaemonEvent::SentOut { data, drop_tokens } => { // keep shared memory alive until we received all drop tokens @@ -225,11 +206,23 @@ impl SharedMemSample { } #[derive(Debug)] -enum Event { +pub enum Event { Node(NodeEvent), Daemon(DaemonEvent), } +impl From for Event { + fn from(event: NodeEvent) -> Self { + Self::Node(event) + } +} + +impl From for Event { + fn from(event: DaemonEvent) -> Self { + Self::Daemon(event) + } +} + #[derive(Debug)] pub enum NodeEvent { PrepareOutputMessage { diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index bda1f97e..de667d71 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -7,14 +7,13 @@ use dora_download::download_file; use eyre::{eyre, WrapErr}; use shared_memory_server::{ShmemConf, ShmemServer}; use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; -use tokio::sync::mpsc; #[tracing::instrument] pub async fn 
spawn_node( dataflow_id: DataflowId, params: SpawnNodeParams, - daemon_tx: mpsc::Sender, - shmem_handler_tx: flume::Sender, + daemon_tx: flume::Sender, + shmem_handler_tx: flume::Sender, ) -> eyre::Result<()> { let SpawnNodeParams { node_id, @@ -59,7 +58,7 @@ pub async fn spawn_node( .wrap_err("failed to create control server")?; let daemon_tx = daemon_tx.clone(); let shmem_handler_tx = shmem_handler_tx.clone(); - tokio::task::spawn_blocking(move || listener_loop(server, daemon_tx, shmem_handler_tx)); + std::thread::spawn(move || listener_loop(server, daemon_tx, shmem_handler_tx)); } { let server = unsafe { ShmemServer::new(daemon_events_region) } @@ -67,7 +66,7 @@ pub async fn spawn_node( let event_loop_node_id = format!("{dataflow_id}/{node_id}"); let daemon_tx = daemon_tx.clone(); let shmem_handler_tx = shmem_handler_tx.clone(); - tokio::task::spawn_blocking(move || { + std::thread::spawn(move || { listener_loop(server, daemon_tx, shmem_handler_tx); tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); }); @@ -117,7 +116,7 @@ pub async fn spawn_node( node_id: node_id_cloned, result, }; - let _ = daemon_tx.send(event.into()).await; + let _ = daemon_tx.send_async(event.into()).await; }); Ok(()) } From 6438ef74137569c2176f628a160e4528f36f20ce Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jan 2023 12:38:18 +0100 Subject: [PATCH 107/225] Add special handling for zero-sized messages to avoid one roundtrip --- apis/rust/node/src/daemon.rs | 30 ++++++++++++++++----- apis/rust/node/src/lib.rs | 21 ++++++++------- binaries/daemon/src/listener.rs | 32 ++++++++++++++++++----- binaries/daemon/src/shared_mem_handler.rs | 29 ++++++++++++++++++-- libraries/core/src/daemon_messages.rs | 6 ++++- 5 files changed, 92 insertions(+), 26 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 4f6a7a3c..a28c1f04 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -5,11 +5,7 @@ use 
dora_core::{ use dora_message::Metadata; use eyre::{bail, eyre, Context}; use shared_memory_server::{Shmem, ShmemClient, ShmemConf}; -use std::{ - marker::PhantomData, - thread::JoinHandle, - time::{Duration, Instant}, -}; +use std::{marker::PhantomData, thread::JoinHandle, time::Duration}; pub struct DaemonConnection { pub control_channel: ControlChannel, @@ -102,10 +98,10 @@ impl ControlChannel { } } - pub fn send_message(&mut self, sample: MessageSample) -> eyre::Result<()> { + pub fn send_prepared_message(&mut self, sample: MessageSample) -> eyre::Result<()> { let reply = self .channel - .request(&DaemonRequest::SendOutMessage { id: sample.id }) + .request(&DaemonRequest::SendPreparedMessage { id: sample.id }) .wrap_err("failed to send SendOutMessage request to dora-daemon")?; match reply { dora_core::daemon_messages::DaemonReply::Result(result) => { @@ -114,6 +110,26 @@ impl ControlChannel { other => bail!("unexpected SendOutMessage reply: {other:?}"), } } + + pub fn send_empty_message( + &mut self, + output_id: DataId, + metadata: dora_message::Metadata<'static>, + ) -> eyre::Result<()> { + let reply = self + .channel + .request(&DaemonRequest::SendEmptyMessage { + output_id, + metadata, + }) + .wrap_err("failed to send SendEmptyMessage request to dora-daemon")?; + match reply { + dora_core::daemon_messages::DaemonReply::Result(result) => { + result.map_err(|err| eyre!(err)) + } + other => bail!("unexpected SendEmptyMessage reply: {other:?}"), + } + } } fn register( diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 8f7087db..92b11432 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -80,13 +80,12 @@ impl DoraNode { } let metadata = Metadata::from_parameters(self.hlc.new_timestamp(), parameters.into_owned()); - let sample = self - .control_channel - .prepare_message(output_id.clone(), metadata, data_len) - .wrap_err("failed to prepare sample for output message")?; - - // map shared memory and fill in data if 
data_len > 0 { + let sample = self + .control_channel + .prepare_message(output_id.clone(), metadata, data_len) + .wrap_err("failed to prepare sample for output message")?; + // map shared memory and fill in data let mut shared_memory = ShmemConf::new() .os_id(&sample.id) .open() @@ -94,13 +93,17 @@ impl DoraNode { let raw = unsafe { shared_memory.as_slice_mut() }; data(&mut raw[..data_len]); + + self.control_channel + .send_prepared_message(sample) + .wrap_err_with(|| format!("failed to send data for output {output_id}"))?; } else { data(&mut []); + self.control_channel + .send_empty_message(output_id.clone(), metadata) + .wrap_err_with(|| format!("failed to send output {output_id}"))?; } - self.control_channel - .send_message(sample) - .wrap_err_with(|| format!("failed to send data for output {output_id}"))?; Ok(()) } diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 0e9b9882..cf80cc0a 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -126,15 +126,15 @@ impl Listener { reply_sender, }; self.send_shared_memory_event(event)?; - self.send_reply( - &reply - .blocking_recv() - .wrap_err("failed to receive prepare output reply")?, - )?; + let reply = reply + .blocking_recv() + .wrap_err("failed to receive prepare output reply")?; + // tracing::debug!("prepare latency: {:?}", start.elapsed()?); + self.send_reply(&reply)?; } - DaemonRequest::SendOutMessage { id } => { + DaemonRequest::SendPreparedMessage { id } => { let (reply_sender, reply) = oneshot::channel(); - let event = shared_mem_handler::NodeEvent::SendOutMessage { id, reply_sender }; + let event = shared_mem_handler::NodeEvent::SendPreparedMessage { id, reply_sender }; self.send_shared_memory_event(event)?; self.send_reply( &reply @@ -142,6 +142,24 @@ impl Listener { .wrap_err("failed to receive send output reply")?, )?; } + DaemonRequest::SendEmptyMessage { + output_id, + metadata, + } => { + let (reply_sender, reply) = oneshot::channel(); 
+ let event = shared_mem_handler::NodeEvent::SendEmptyMessage { + dataflow_id: self.dataflow_id, + node_id: self.node_id.clone(), + output_id, + metadata, + reply_sender, + }; + self.send_shared_memory_event(event)?; + let reply = reply + .blocking_recv() + .wrap_err("failed to receive send_empty_message reply")?; + self.send_reply(&reply)?; + } DaemonRequest::Subscribe => { let (tx, rx) = flume::bounded(10); self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx })?; diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index 096cedd1..2a68e945 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -131,7 +131,7 @@ impl SharedMemHandler { self.prepared_messages.remove(&id); } } - NodeEvent::SendOutMessage { id, reply_sender } => { + NodeEvent::SendPreparedMessage { id, reply_sender } => { let message = self .prepared_messages .remove(&id) @@ -159,6 +159,24 @@ impl SharedMemHandler { send_result.map_err(|_| "daemon is no longer running".into()), )); } + NodeEvent::SendEmptyMessage { + dataflow_id, + node_id, + output_id, + metadata, + reply_sender, + } => { + let send_result = self.events_tx.send(crate::ShmemHandlerEvent::SendOut { + dataflow_id, + node_id, + output_id, + metadata, + data: None, + }); + let _ = reply_sender.send(DaemonReply::Result( + send_result.map_err(|_| "daemon is no longer running".into()), + )); + } } Ok(()) } @@ -233,10 +251,17 @@ pub enum NodeEvent { data_len: usize, reply_sender: oneshot::Sender, }, - SendOutMessage { + SendPreparedMessage { id: MessageId, reply_sender: oneshot::Sender, }, + SendEmptyMessage { + dataflow_id: DataflowId, + node_id: NodeId, + output_id: DataId, + metadata: dora_message::Metadata<'static>, + reply_sender: oneshot::Sender, + }, Drop(DropEvent), } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 04b4a8a1..25ab4f17 100644 --- 
a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -28,9 +28,13 @@ pub enum DaemonRequest { metadata: Metadata<'static>, data_len: usize, }, - SendOutMessage { + SendPreparedMessage { id: SharedMemoryId, }, + SendEmptyMessage { + output_id: DataId, + metadata: Metadata<'static>, + }, Stopped, NextEvent { drop_tokens: Vec, From c663f58d16a28b5249b13637a342005811e5ade1 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jan 2023 13:55:47 +0100 Subject: [PATCH 108/225] Revert "Use flume channels in more places spawn more actors as separate threads" This reverts commit accfae6d97af1b88682f439db928184cb75ae339. --- binaries/daemon/src/lib.rs | 35 +++++++----- binaries/daemon/src/listener.rs | 14 ++--- binaries/daemon/src/shared_mem_handler.rs | 67 +++++++++++++---------- binaries/daemon/src/spawn.rs | 11 ++-- 4 files changed, 71 insertions(+), 56 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 847d585d..e0fb514c 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -21,8 +21,12 @@ use std::{ time::{Duration, Instant}, }; use tcp_utils::tcp_receive; -use tokio::{fs, sync::oneshot, time::timeout}; -use tokio_stream::{Stream, StreamExt}; +use tokio::{ + fs, + sync::{mpsc, oneshot}, + time::timeout, +}; +use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt}; use uuid::Uuid; mod coordinator; @@ -34,9 +38,10 @@ mod tcp_utils; pub struct Daemon { running: HashMap, - events_tx: flume::Sender, + events_tx: mpsc::Sender, - shared_memory_handler: flume::Sender, + shared_memory_handler: flume::Sender, + shared_memory_handler_node: flume::Sender, coordinator_addr: Option, machine_id: String, @@ -124,22 +129,26 @@ impl Daemon { machine_id: String, exit_when_done: Option>, ) -> eyre::Result<()> { - let (dora_events_tx, dora_events_rx) = flume::bounded(50); - let (shared_memory_handler, shared_memory_rx) = flume::unbounded(); + let (dora_events_tx, dora_events_rx) = 
mpsc::channel(5); + let (shared_memory_handler, shared_memory_daemon_rx) = flume::unbounded(); + let (shared_memory_handler_node, shared_memory_node_rx) = flume::bounded(10); let daemon = Self { running: HashMap::new(), events_tx: dora_events_tx, shared_memory_handler, + shared_memory_handler_node, coordinator_addr, machine_id, exit_when_done, }; let (shmem_events_tx, shmem_events_rx) = flume::bounded(5); - std::thread::spawn(|| { + tokio::spawn(async { let mut handler = shared_mem_handler::SharedMemHandler::new(shmem_events_tx); - handler.run(shared_memory_rx); + handler + .run(shared_memory_node_rx, shared_memory_daemon_rx) + .await; }); - let dora_events = dora_events_rx.into_stream(); + let dora_events = ReceiverStream::new(dora_events_rx); let shmem_events = shmem_events_rx.into_stream().map(Event::ShmemHandler); let watchdog_interval = tokio_stream::wrappers::IntervalStream::new(tokio::time::interval( Duration::from_secs(5), @@ -298,7 +307,7 @@ impl Daemon { dataflow_id, params, self.events_tx.clone(), - self.shared_memory_handler.clone(), + self.shared_memory_handler_node.clone(), ) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; @@ -319,7 +328,7 @@ impl Daemon { Default::default(), ), }; - if events_tx.send_async(event.into()).await.is_err() { + if events_tx.send(event.into()).await.is_err() { break; } } @@ -573,9 +582,7 @@ impl Daemon { if let Some(data) = data { if let Err(err) = self .shared_memory_handler - .send_async( - shared_mem_handler::DaemonEvent::SentOut { data, drop_tokens }.into(), - ) + .send_async(shared_mem_handler::DaemonEvent::SentOut { data, drop_tokens }) .await .wrap_err("shared mem handler crashed after send out") { diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index cf80cc0a..12360d09 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -5,13 +5,13 @@ use dora_core::{ }; use eyre::{eyre, Context}; use shared_memory_server::ShmemServer; -use 
tokio::sync::oneshot; +use tokio::sync::{mpsc, oneshot}; #[tracing::instrument(skip(server, daemon_tx, shmem_handler_tx))] pub fn listener_loop( mut server: ShmemServer, - daemon_tx: flume::Sender, - shmem_handler_tx: flume::Sender, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, ) { // receive the first message let message = match server @@ -71,8 +71,8 @@ struct Listener { dataflow_id: DataflowId, node_id: NodeId, server: ShmemServer, - daemon_tx: flume::Sender, - shmem_handler_tx: flume::Sender, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, subscribed_events: Option>, } @@ -199,7 +199,7 @@ impl Listener { reply_sender: reply_tx, }; self.daemon_tx - .send(event) + .blocking_send(event) .map_err(|_| eyre!("failed to send event to daemon"))?; let reply = reply .blocking_recv() @@ -216,7 +216,7 @@ impl Listener { fn send_shared_memory_event(&self, event: shared_mem_handler::NodeEvent) -> eyre::Result<()> { self.shmem_handler_tx - .send(event.into()) + .send(event) .map_err(|_| eyre!("failed to send event to shared_mem_handler")) } } diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index 2a68e945..db598556 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -11,6 +11,8 @@ use dora_core::{ }; use eyre::{eyre, Context}; use flume::{Receiver, Sender}; +use futures::StreamExt; +use futures_concurrency::stream::Merge; use shared_memory_server::{Shmem, ShmemConf}; use tokio::sync::oneshot; use uuid::Uuid; @@ -34,24 +36,38 @@ impl SharedMemHandler { } } - pub fn run(&mut self, events: Receiver) { - if let Err(err) = self.run_inner(events) { + pub async fn run( + &mut self, + node_events: Receiver, + daemon_events: Receiver, + ) { + if let Err(err) = self.run_inner(node_events, daemon_events).await { if let Err(send_err) = self .events_tx - .send(crate::ShmemHandlerEvent::HandlerError(err)) + 
.send_async(crate::ShmemHandlerEvent::HandlerError(err)) + .await { tracing::error!("{send_err:?}"); } } } - pub fn run_inner(&mut self, events: Receiver) -> eyre::Result<()> { - while let Ok(event) = events.recv() { + pub async fn run_inner( + &mut self, + node_events: Receiver, + daemon_events: Receiver, + ) -> eyre::Result<()> { + let mut events = ( + node_events.stream().map(Event::Node), + daemon_events.stream().map(Event::Daemon), + ) + .merge(); + while let Some(event) = events.next().await { let start = Instant::now(); let event_debug = format!("{event:?}"); match event { - Event::Node(event) => self.handle_node_event(event)?, - Event::Daemon(event) => self.handle_daemon_event(event)?, + Event::Node(event) => self.handle_node_event(event).await?, + Event::Daemon(event) => self.handle_daemon_event(event).await?, } let elapsed = start.elapsed(); // if elapsed.as_micros() > 10 { @@ -61,14 +77,14 @@ impl SharedMemHandler { Ok(()) } - fn handle_node_event(&mut self, event: NodeEvent) -> eyre::Result<()> { + async fn handle_node_event(&mut self, event: NodeEvent) -> eyre::Result<()> { match event { NodeEvent::Drop(DropEvent { tokens }) => { for token in tokens { match self.sent_out_shared_memory.remove(&token) { Some(arc) => { if let Ok(shmem) = Arc::try_unwrap(arc) { - std::thread::spawn(move || { + tokio::task::spawn_blocking(move || { tracing::trace!( "freeing shared memory after receiving last drop token" ); @@ -148,13 +164,16 @@ impl SharedMemHandler { len, }); - let send_result = self.events_tx.send(crate::ShmemHandlerEvent::SendOut { - dataflow_id, - node_id, - output_id, - metadata, - data, - }); + let send_result = self + .events_tx + .send_async(crate::ShmemHandlerEvent::SendOut { + dataflow_id, + node_id, + output_id, + metadata, + data, + }) + .await; let _ = reply_sender.send(DaemonReply::Result( send_result.map_err(|_| "daemon is no longer running".into()), )); @@ -181,7 +200,7 @@ impl SharedMemHandler { Ok(()) } - fn handle_daemon_event(&mut self, 
event: DaemonEvent) -> eyre::Result<()> { + async fn handle_daemon_event(&mut self, event: DaemonEvent) -> eyre::Result<()> { match event { DaemonEvent::SentOut { data, drop_tokens } => { // keep shared memory alive until we received all drop tokens @@ -224,23 +243,11 @@ impl SharedMemSample { } #[derive(Debug)] -pub enum Event { +enum Event { Node(NodeEvent), Daemon(DaemonEvent), } -impl From for Event { - fn from(event: NodeEvent) -> Self { - Self::Node(event) - } -} - -impl From for Event { - fn from(event: DaemonEvent) -> Self { - Self::Daemon(event) - } -} - #[derive(Debug)] pub enum NodeEvent { PrepareOutputMessage { diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index de667d71..bda1f97e 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -7,13 +7,14 @@ use dora_download::download_file; use eyre::{eyre, WrapErr}; use shared_memory_server::{ShmemConf, ShmemServer}; use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; +use tokio::sync::mpsc; #[tracing::instrument] pub async fn spawn_node( dataflow_id: DataflowId, params: SpawnNodeParams, - daemon_tx: flume::Sender, - shmem_handler_tx: flume::Sender, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, ) -> eyre::Result<()> { let SpawnNodeParams { node_id, @@ -58,7 +59,7 @@ pub async fn spawn_node( .wrap_err("failed to create control server")?; let daemon_tx = daemon_tx.clone(); let shmem_handler_tx = shmem_handler_tx.clone(); - std::thread::spawn(move || listener_loop(server, daemon_tx, shmem_handler_tx)); + tokio::task::spawn_blocking(move || listener_loop(server, daemon_tx, shmem_handler_tx)); } { let server = unsafe { ShmemServer::new(daemon_events_region) } @@ -66,7 +67,7 @@ pub async fn spawn_node( let event_loop_node_id = format!("{dataflow_id}/{node_id}"); let daemon_tx = daemon_tx.clone(); let shmem_handler_tx = shmem_handler_tx.clone(); - std::thread::spawn(move || { + tokio::task::spawn_blocking(move || { 
listener_loop(server, daemon_tx, shmem_handler_tx); tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); }); @@ -116,7 +117,7 @@ pub async fn spawn_node( node_id: node_id_cloned, result, }; - let _ = daemon_tx.send_async(event.into()).await; + let _ = daemon_tx.send(event.into()).await; }); Ok(()) } From 148b52d47bbeac88b3b05f1807e1102d4ae3a473 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jan 2023 14:37:27 +0100 Subject: [PATCH 109/225] Skip drop message if there are no drop tokens --- binaries/daemon/src/listener.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 12360d09..660eab7c 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -166,10 +166,12 @@ impl Listener { self.subscribed_events = Some(rx); } DaemonRequest::NextEvent { drop_tokens } => { - let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { - tokens: drop_tokens, - }); - self.send_shared_memory_event(drop_event)?; + if !drop_tokens.is_empty() { + let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { + tokens: drop_tokens, + }); + self.send_shared_memory_event(drop_event)?; + } let reply = match self.subscribed_events.as_mut() { Some(events) => match events.recv() { From e3a82d3b7ddaebb6a97562f2062c59d8791cb059 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jan 2023 14:54:20 +0100 Subject: [PATCH 110/225] Skip shared memory handler for empty messages --- binaries/daemon/src/listener.rs | 22 ++++++++++++-------- binaries/daemon/src/shared_mem_handler.rs | 25 ----------------------- 2 files changed, 13 insertions(+), 34 deletions(-) diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 660eab7c..80fdd24f 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -146,19 +146,17 @@ impl Listener { output_id, metadata, } => { - let 
(reply_sender, reply) = oneshot::channel(); - let event = shared_mem_handler::NodeEvent::SendEmptyMessage { + let event = crate::Event::ShmemHandler(crate::ShmemHandlerEvent::SendOut { dataflow_id: self.dataflow_id, node_id: self.node_id.clone(), output_id, metadata, - reply_sender, - }; - self.send_shared_memory_event(event)?; - let reply = reply - .blocking_recv() - .wrap_err("failed to receive send_empty_message reply")?; - self.send_reply(&reply)?; + data: None, + }); + let result = self + .send_daemon_event(event) + .map_err(|_| "failed to receive send_empty_message reply".to_owned()); + self.send_reply(&DaemonReply::Result(result))?; } DaemonRequest::Subscribe => { let (tx, rx) = flume::bounded(10); @@ -221,4 +219,10 @@ impl Listener { .send(event) .map_err(|_| eyre!("failed to send event to shared_mem_handler")) } + + fn send_daemon_event(&self, event: crate::Event) -> eyre::Result<()> { + self.daemon_tx + .blocking_send(event) + .map_err(|_| eyre!("failed to send event to daemon")) + } } diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index db598556..86f955ce 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -178,24 +178,6 @@ impl SharedMemHandler { send_result.map_err(|_| "daemon is no longer running".into()), )); } - NodeEvent::SendEmptyMessage { - dataflow_id, - node_id, - output_id, - metadata, - reply_sender, - } => { - let send_result = self.events_tx.send(crate::ShmemHandlerEvent::SendOut { - dataflow_id, - node_id, - output_id, - metadata, - data: None, - }); - let _ = reply_sender.send(DaemonReply::Result( - send_result.map_err(|_| "daemon is no longer running".into()), - )); - } } Ok(()) } @@ -262,13 +244,6 @@ pub enum NodeEvent { id: MessageId, reply_sender: oneshot::Sender, }, - SendEmptyMessage { - dataflow_id: DataflowId, - node_id: NodeId, - output_id: DataId, - metadata: dora_message::Metadata<'static>, - reply_sender: oneshot::Sender, 
- }, Drop(DropEvent), } From 26875bd5fef80bdd6cec0a4c8f61bc35ce645f0d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jan 2023 15:06:55 +0100 Subject: [PATCH 111/225] Limit queue length to 10 events per subscriber Drop oldest message when queue is full. --- binaries/daemon/src/lib.rs | 40 ++++++++++++++++++++++++++++++--- binaries/daemon/src/listener.rs | 10 +++++++-- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index e0fb514c..75cd93d0 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -18,6 +18,7 @@ use std::{ fmt, net::SocketAddr, path::Path, + sync::Weak, time::{Duration, Instant}, }; use tcp_utils::tcp_receive; @@ -348,10 +349,20 @@ impl Daemon { reply_sender: oneshot::Sender, ) -> eyre::Result<()> { match event { - DaemonNodeEvent::Subscribe { event_sender } => { + DaemonNodeEvent::Subscribe { + event_sender, + receiver_handle, + } => { let result = match self.running.get_mut(&dataflow_id) { Some(dataflow) => { - dataflow.subscribe_channels.insert(node_id, event_sender); + dataflow.subscribe_channels.insert( + node_id, + SubscribeChannel { + sender: event_sender, + receiver_handle, + max_queue_len: 10, // TODO: make this configurable + }, + ); Ok(()) } None => Err(format!( @@ -604,7 +615,7 @@ impl Daemon { #[derive(Default)] pub struct RunningDataflow { - subscribe_channels: HashMap>, + subscribe_channels: HashMap, mappings: HashMap>, timers: BTreeMap>, open_inputs: BTreeMap>, @@ -613,6 +624,28 @@ pub struct RunningDataflow { _timer_handles: Vec>, } +struct SubscribeChannel { + sender: flume::Sender, + receiver_handle: Weak>, + max_queue_len: usize, +} + +impl SubscribeChannel { + async fn send_async(&self, event: daemon_messages::NodeEvent) -> eyre::Result<()> { + while self.sender.len() >= self.max_queue_len { + if let Some(receiver) = self.receiver_handle.upgrade() { + if receiver.try_recv().is_ok() { + tracing::debug!("Dropping message because 
queue is full"); + } + } + } + self.sender + .send_async(event) + .await + .map_err(|_| eyre!("failed to send event")) + } +} + type OutputId = (NodeId, DataId); type InputId = (NodeId, DataId); @@ -646,6 +679,7 @@ pub enum DaemonNodeEvent { Stopped, Subscribe { event_sender: flume::Sender, + receiver_handle: Weak>, }, } diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 80fdd24f..bc0decd5 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use crate::{shared_mem_handler, DaemonNodeEvent, Event}; use dora_core::{ config::NodeId, @@ -73,7 +75,7 @@ struct Listener { server: ShmemServer, daemon_tx: mpsc::Sender, shmem_handler_tx: flume::Sender, - subscribed_events: Option>, + subscribed_events: Option>>, } impl Listener { @@ -160,7 +162,11 @@ impl Listener { } DaemonRequest::Subscribe => { let (tx, rx) = flume::bounded(10); - self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx })?; + let rx = Arc::new(rx); + self.process_daemon_event(DaemonNodeEvent::Subscribe { + event_sender: tx, + receiver_handle: Arc::downgrade(&rx), + })?; self.subscribed_events = Some(rx); } DaemonRequest::NextEvent { drop_tokens } => { From 6c6221f5700ffc3fae464c83fb09f73163e5f4ef Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jan 2023 17:53:52 +0100 Subject: [PATCH 112/225] Only drop input events and free shmem on dropping We don't want to drop other types of events (e.g. stop or input closed) because they might be important for the correct functionality. For this reason, we only drop input events now. To implement this, we use a FIFO buffer in the listener instead of dropping the events on the sender side. This commit also fixes a memory leak that occured when dropping values. We now report the drop tokens properly to the shared memory handler when dropping an input. 
--- binaries/daemon/src/lib.rs | 40 +----------- binaries/daemon/src/listener.rs | 111 +++++++++++++++++++++++++------- 2 files changed, 89 insertions(+), 62 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 75cd93d0..e0fb514c 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -18,7 +18,6 @@ use std::{ fmt, net::SocketAddr, path::Path, - sync::Weak, time::{Duration, Instant}, }; use tcp_utils::tcp_receive; @@ -349,20 +348,10 @@ impl Daemon { reply_sender: oneshot::Sender, ) -> eyre::Result<()> { match event { - DaemonNodeEvent::Subscribe { - event_sender, - receiver_handle, - } => { + DaemonNodeEvent::Subscribe { event_sender } => { let result = match self.running.get_mut(&dataflow_id) { Some(dataflow) => { - dataflow.subscribe_channels.insert( - node_id, - SubscribeChannel { - sender: event_sender, - receiver_handle, - max_queue_len: 10, // TODO: make this configurable - }, - ); + dataflow.subscribe_channels.insert(node_id, event_sender); Ok(()) } None => Err(format!( @@ -615,7 +604,7 @@ impl Daemon { #[derive(Default)] pub struct RunningDataflow { - subscribe_channels: HashMap, + subscribe_channels: HashMap>, mappings: HashMap>, timers: BTreeMap>, open_inputs: BTreeMap>, @@ -624,28 +613,6 @@ pub struct RunningDataflow { _timer_handles: Vec>, } -struct SubscribeChannel { - sender: flume::Sender, - receiver_handle: Weak>, - max_queue_len: usize, -} - -impl SubscribeChannel { - async fn send_async(&self, event: daemon_messages::NodeEvent) -> eyre::Result<()> { - while self.sender.len() >= self.max_queue_len { - if let Some(receiver) = self.receiver_handle.upgrade() { - if receiver.try_recv().is_ok() { - tracing::debug!("Dropping message because queue is full"); - } - } - } - self.sender - .send_async(event) - .await - .map_err(|_| eyre!("failed to send event")) - } -} - type OutputId = (NodeId, DataId); type InputId = (NodeId, DataId); @@ -679,7 +646,6 @@ pub enum DaemonNodeEvent { Stopped, Subscribe { 
event_sender: flume::Sender, - receiver_handle: Weak>, }, } diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index bc0decd5..1d2fe73f 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::collections::VecDeque; use crate::{shared_mem_handler, DaemonNodeEvent, Event}; use dora_core::{ @@ -49,6 +49,8 @@ pub fn listener_loop( daemon_tx, shmem_handler_tx, subscribed_events: None, + max_queue_len: 10, // TODO: make this configurable + queue: VecDeque::new(), }; match listener.run().wrap_err("listener failed") { Ok(()) => {} @@ -75,13 +77,15 @@ struct Listener { server: ShmemServer, daemon_tx: mpsc::Sender, shmem_handler_tx: flume::Sender, - subscribed_events: Option>>, + subscribed_events: Option>, + max_queue_len: usize, + queue: VecDeque, } impl Listener { fn run(&mut self) -> eyre::Result<()> { loop { - // receive the next message + // receive the next node message let message = match self .server .listen() @@ -101,11 +105,59 @@ impl Listener { continue; } }; + + // handle incoming events + self.handle_events()?; + self.handle_message(message)?; } Ok(()) } + fn handle_events(&mut self) -> eyre::Result<()> { + if let Some(events) = &mut self.subscribed_events { + while let Ok(event) = events.try_recv() { + self.queue.push_back(event); + } + + // drop oldest input events to maintain max queue length queue + let input_event_count = self + .queue + .iter() + .filter(|e| matches!(e, NodeEvent::Input { .. })) + .count(); + let drop_n = input_event_count.saturating_sub(self.max_queue_len); + self.drop_oldest_inputs(drop_n)?; + } + Ok(()) + } + + fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { + let mut drop_tokens = Vec::new(); + for i in 0..number { + // find index of oldest input event + let index = self + .queue + .iter() + .position(|e| matches!(e, NodeEvent::Input { .. 
})) + .expect(&format!("no input event found in drop iteration {i}")); + + // remove that event + if let Some(event) = self.queue.remove(index) { + tracing::debug!("dropping event {event:?}"); + + if let NodeEvent::Input { + data: Some(data), .. + } = event + { + drop_tokens.push(data.drop_token); + } + } + } + self.report_drop_tokens(drop_tokens)?; + Ok(()) + } + fn handle_message(&mut self, message: DaemonRequest) -> eyre::Result<()> { match message { DaemonRequest::Register { .. } => { @@ -161,33 +213,29 @@ impl Listener { self.send_reply(&DaemonReply::Result(result))?; } DaemonRequest::Subscribe => { - let (tx, rx) = flume::bounded(10); - let rx = Arc::new(rx); - self.process_daemon_event(DaemonNodeEvent::Subscribe { - event_sender: tx, - receiver_handle: Arc::downgrade(&rx), - })?; + let (tx, rx) = flume::bounded(100); + self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx })?; self.subscribed_events = Some(rx); } DaemonRequest::NextEvent { drop_tokens } => { - if !drop_tokens.is_empty() { - let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { - tokens: drop_tokens, - }); - self.send_shared_memory_event(drop_event)?; - } + self.report_drop_tokens(drop_tokens)?; - let reply = match self.subscribed_events.as_mut() { - Some(events) => match events.recv() { - Ok(event) => DaemonReply::NodeEvent(event), - Err(flume::RecvError::Disconnected) => DaemonReply::Closed, - }, - None => { - DaemonReply::Result(Err("Ignoring event request because no subscribe \ - message was sent yet" - .into())) + // try to take the latest queued event first + let queued_event = self.queue.pop_front().map(DaemonReply::NodeEvent); + let reply = queued_event.unwrap_or_else(|| { + match self.subscribed_events.as_mut() { + // wait for next event + Some(events) => match events.recv() { + Ok(event) => DaemonReply::NodeEvent(event), + Err(flume::RecvError::Disconnected) => DaemonReply::Closed, + }, + None => { + DaemonReply::Result(Err("Ignoring event request 
because no subscribe \ + message was sent yet" + .into())) + } } - }; + }); self.send_reply(&reply)?; } @@ -195,6 +243,19 @@ impl Listener { Ok(()) } + fn report_drop_tokens( + &mut self, + drop_tokens: Vec, + ) -> eyre::Result<()> { + if !drop_tokens.is_empty() { + let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { + tokens: drop_tokens, + }); + self.send_shared_memory_event(drop_event)?; + } + Ok(()) + } + fn process_daemon_event(&mut self, event: DaemonNodeEvent) -> eyre::Result<()> { // send NodeEvent to daemon main loop let (reply_tx, reply) = oneshot::channel(); From 77e71393c9a99297965bc508e0e56b7d8948238d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 27 Jan 2023 14:55:42 +0100 Subject: [PATCH 113/225] Add an async receive method --- apis/rust/node/src/daemon.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index a28c1f04..13a51ead 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -154,8 +154,10 @@ fn register( Ok(()) } +type EventItem = (NodeEvent, std::sync::mpsc::Sender<()>); + pub struct EventStream { - receiver: flume::Receiver<(NodeEvent, std::sync::mpsc::Sender<()>)>, + receiver: flume::Receiver, } impl EventStream { @@ -237,7 +239,17 @@ impl EventStream { } pub fn recv(&mut self) -> Option { - let (node_event, drop_sender) = match self.receiver.recv() { + let event = self.receiver.recv(); + self.recv_common(event) + } + + pub async fn recv_async(&mut self) -> Option { + let event = self.receiver.recv_async().await; + self.recv_common(event) + } + + fn recv_common(&mut self, event: Result) -> Option { + let (node_event, drop_sender) = match event { Ok(d) => d, Err(flume::RecvError::Disconnected) => return None, }; From 492339a687b6a2c53468e37af59a982ba9be7e13 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 27 Jan 2023 14:56:49 +0100 Subject: [PATCH 114/225] WIP: Start porting 
`dora-runtime` to `dora-daemon` --- Cargo.lock | 51 +++++ Cargo.toml | 2 +- binaries/daemon/src/listener.rs | 2 + binaries/daemon/src/shared_mem_handler.rs | 2 +- binaries/runtime/src/lib.rs | 228 +++++++++++++--------- binaries/runtime/src/operator/mod.rs | 88 ++++----- binaries/runtime/src/operator/python.rs | 153 +++++++-------- libraries/core/src/daemon_messages.rs | 8 +- 8 files changed, 317 insertions(+), 217 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0a05d4eb..35a8f903 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1174,6 +1174,37 @@ dependencies = [ "safer-ffi", ] +[[package]] +name = "dora-runtime" +version = "0.1.2" +dependencies = [ + "clap 3.2.20", + "dora-core", + "dora-download", + "dora-message", + "dora-metrics", + "dora-node-api", + "dora-operator-api-python", + "dora-operator-api-types", + "dora-tracing", + "eyre", + "fern", + "flume", + "futures", + "futures-concurrency 2.0.3", + "libloading", + "opentelemetry", + "opentelemetry-system-metrics", + "pyo3", + "serde_yaml 0.8.23", + "tokio", + "tokio-stream", + "tracing", + "tracing-subscriber", + "zenoh", + "zenoh-config", +] + [[package]] name = "dora-tracing" version = "0.1.2" @@ -1254,6 +1285,15 @@ dependencies = [ "instant", ] +[[package]] +name = "fern" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bdd7b0849075e79ee9a1836df22c717d1eba30451796fdc631b04565dd11e2a" +dependencies = [ + "log", +] + [[package]] name = "fixedbitset" version = "0.4.1" @@ -1345,6 +1385,17 @@ dependencies = [ "futures-sink", ] +[[package]] +name = "futures-concurrency" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e98b7b5aedee7c34a5cfb1ee1681af8faf46e2f30c0b8af5ea08eba517d61c" +dependencies = [ + "async-trait", + "futures-core", + "pin-project", +] + [[package]] name = "futures-concurrency" version = "5.0.1" diff --git a/Cargo.toml b/Cargo.toml index 0aef9722..b0ca1b3c 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -10,7 +10,7 @@ members = [ "binaries/cli", "binaries/coordinator", "binaries/daemon", - # "binaries/runtime", + "binaries/runtime", "examples/rust-dataflow/*", "examples/benchmark/*", "libraries/communication-layer/*", diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 1d2fe73f..93c99f93 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -200,6 +200,8 @@ impl Listener { output_id, metadata, } => { + // let elapsed = metadata.timestamp().get_time().to_system_time().elapsed()?; + // tracing::debug!("listener SendEmptyMessage: {elapsed:?}"); let event = crate::Event::ShmemHandler(crate::ShmemHandlerEvent::SendOut { dataflow_id: self.dataflow_id, node_id: self.node_id.clone(), diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index 86f955ce..8a983ae5 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -64,7 +64,7 @@ impl SharedMemHandler { .merge(); while let Some(event) = events.next().await { let start = Instant::now(); - let event_debug = format!("{event:?}"); + match event { Event::Node(event) => self.handle_node_event(event).await?, Event::Daemon(event) => self.handle_daemon_event(event).await?, diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index fedf559b..ef7a0a87 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -1,16 +1,13 @@ #![warn(unsafe_op_in_unsafe_fn)] use dora_core::{ - config::{CommunicationConfig, DataId, NodeId, OperatorId}, - descriptor::OperatorDefinition, -}; -use dora_node_api::{ - self, - communication::{self, CommunicationLayer, Publisher, STOP_TOPIC}, - manual_stop_publisher, + config::{DataId, OperatorId}, + daemon_messages::RuntimeConfig, }; +use dora_node_api::DoraNode; use eyre::{bail, Context, Result}; use futures::{Stream, StreamExt}; +use futures_concurrency::Merge; use operator::{spawn_operator, 
OperatorEvent, StopReason}; use std::{ @@ -25,77 +22,85 @@ mod operator; pub fn main() -> eyre::Result<()> { set_up_tracing().context("failed to set up tracing subscriber")?; - let node_id: NodeId = { - let raw = - std::env::var("DORA_NODE_ID").wrap_err("env variable DORA_NODE_ID must be set")?; + let config: RuntimeConfig = { + let raw = std::env::var("DORA_NODE_CONFIG") + .wrap_err("env variable DORA_NODE_CONFIG must be set")?; serde_yaml::from_str(&raw).context("failed to deserialize operator config")? }; - let communication_config: CommunicationConfig = { - let raw = std::env::var("DORA_COMMUNICATION_CONFIG") - .wrap_err("env variable DORA_COMMUNICATION_CONFIG must be set")?; - serde_yaml::from_str(&raw).context("failed to deserialize communication config")? - }; - let operators: Vec = { - let raw = - std::env::var("DORA_OPERATORS").wrap_err("env variable DORA_OPERATORS must be set")?; - serde_yaml::from_str(&raw).context("failed to deserialize operator config")? - }; - - let mut communication: Box = - communication::init(&communication_config)?; + let RuntimeConfig { + node: config, + operators, + } = config; + let node_id = config.node_id.clone(); + let (node, daemon_events) = DoraNode::init(config)?; let mut operator_events = StreamMap::new(); - let mut operator_stop_publishers = HashMap::new(); + // let mut operator_stop_publishers = HashMap::new(); let mut operator_events_tx = HashMap::new(); for operator_config in &operators { let (events_tx, events) = mpsc::channel(1); - let stop_publisher = publisher( - &node_id, - operator_config.id.clone(), - STOP_TOPIC.to_owned().into(), - communication.as_mut(), - ) - .with_context(|| { - format!( - "failed to create stop publisher for operator {}", - operator_config.id - ) - })?; - operator_stop_publishers.insert(operator_config.id.clone(), stop_publisher); + // let stop_publisher = publisher( + // &config.node_id, + // operator_config.id.clone(), + // STOP_TOPIC.to_owned().into(), + // communication.as_mut(), + // 
) + // .with_context(|| { + // format!( + // "failed to create stop publisher for operator {}", + // operator_config.id + // ) + // })?; + // operator_stop_publishers.insert(operator_config.id.clone(), stop_publisher); operator_events.insert(operator_config.id.clone(), ReceiverStream::new(events)); operator_events_tx.insert(operator_config.id.clone(), events_tx); } let operator_events = operator_events.map(|(id, event)| Event::Operator { id, event }); + let daemon_events = futures::stream::unfold(daemon_events, |mut stream| async { + let event = stream.recv_async().await.map(|event| match event { + dora_node_api::daemon::Event::Stop => Event::Stop, + dora_node_api::daemon::Event::Input { id, metadata, data } => Event::Input { + id, + metadata, + data: data.map(|data| data.to_owned()), + }, + dora_node_api::daemon::Event::InputClosed { id } => Event::InputClosed(id), + dora_node_api::daemon::Event::Error(err) => Event::Error(err), + _ => todo!(), + }); + event.map(|event| (event, stream)) + }); + let events = (operator_events, daemon_events).merge(); let node_id_clone = node_id.clone(); let tokio_runtime = Builder::new_current_thread() .enable_all() .build() .wrap_err("Could not build a tokio runtime.")?; - let manual_stop_publisher = manual_stop_publisher(communication.as_mut())?; - let stop_thread = std::thread::spawn(move || -> Result<()> { - tokio_runtime.block_on(run( - node_id_clone, - operator_events, - operator_stop_publishers, - manual_stop_publisher, - )) - }); + + let mut operator_channels = HashMap::new(); for operator_config in &operators { let events_tx = operator_events_tx.get(&operator_config.id).unwrap(); + let (operator_tx, incoming_events) = mpsc::channel(10); spawn_operator( &node_id, operator_config.clone(), + incoming_events, events_tx.clone(), - communication.as_mut(), ) .wrap_err_with(|| format!("failed to init operator {}", operator_config.id))?; + + operator_channels.insert(operator_config.id.clone(), operator_tx); } - stop_thread + let 
main_task = std::thread::spawn(move || -> Result<()> { + tokio_runtime.block_on(run(node, events, operator_channels)) + }); + + main_task .join() .map_err(|err| eyre::eyre!("Stop thread failed with err: {err:#?}"))? .wrap_err("Stop loop thread failed unexpectedly.")?; @@ -103,10 +108,9 @@ pub fn main() -> eyre::Result<()> { } async fn run( - node_id: NodeId, + mut node: DoraNode, mut events: impl Stream + Unpin, - mut operator_stop_publishers: HashMap>, - manual_stop_publisher: Box, + mut operator_channels: HashMap>, ) -> eyre::Result<()> { #[cfg(feature = "metrics")] let _started = { @@ -120,14 +124,17 @@ async fn run( _started }; - let mut stopped_operators = BTreeSet::new(); + // let mut stopped_operators = BTreeSet::new(); while let Some(event) = events.next().await { match event { - Event::Operator { id, event } => { + Event::Operator { + id: operator_id, + event, + } => { match event { OperatorEvent::Error(err) => { - bail!(err.wrap_err(format!("operator {id} failed"))) + bail!(err.wrap_err(format!("operator {operator_id} failed"))) } OperatorEvent::Panic(payload) => std::panic::resume_unwind(payload), OperatorEvent::Finished { reason } => { @@ -137,34 +144,70 @@ async fn run( let data = metadata .serialize() .wrap_err("failed to serialize stop message")?; - manual_stop_publisher - .publish(&data) - .map_err(|err| eyre::eyre!(err)) - .wrap_err("failed to send stop message")?; + // manual_stop_publisher + // .publish(&data) + // .map_err(|err| eyre::eyre!(err)) + // .wrap_err("failed to send stop message")?; break; } - if let Some(stop_publisher) = operator_stop_publishers.remove(&id) { - tracing::info!("operator {node_id}/{id} finished ({reason:?})"); - stopped_operators.insert(id.clone()); - // send stopped message - tokio::task::spawn_blocking(move || stop_publisher.publish(&[])) - .await - .wrap_err("failed to join stop publish task")? 
- .map_err(|err| eyre::eyre!(err)) - .with_context(|| { - format!( - "failed to send stop message for operator `{node_id}/{id}`" - ) - })?; - if operator_stop_publishers.is_empty() { - break; - } - } else { - tracing::warn!("no stop publisher for {id}"); - } + // if let Some(stop_publisher) = operator_stop_publishers.remove(&id) { + // tracing::info!("operator {node_id}/{id} finished ({reason:?})"); + // stopped_operators.insert(id.clone()); + // // send stopped message + // tokio::task::spawn_blocking(move || stop_publisher.publish(&[])) + // .await + // .wrap_err("failed to join stop publish task")? + // .map_err(|err| eyre::eyre!(err)) + // .with_context(|| { + // format!( + // "failed to send stop message for operator `{node_id}/{id}`" + // ) + // })?; + // if operator_stop_publishers.is_empty() { + // break; + // } + // } else { + // tracing::warn!("no stop publisher for {id}"); + // } + } + OperatorEvent::Output { + output_id, + metadata, + data, + } => { + let output_id = DataId::from(format!("{operator_id}/{output_id}")); + node.send_output(output_id, metadata, data.len(), |buf| { + buf.copy_from_slice(&data); + }); } } } + Event::Stop => { + // forward stop event to all operators + for channel in operator_channels.values_mut() { + channel.send(operator::IncomingEvent::Stop).await; + } + } + Event::Input { id, metadata, data } => { + let Some((operator_id, input_id)) = id.as_str().split_once('/') else { + tracing::warn!("received non-operator input {id}"); + continue; + }; + let operator_id = OperatorId::from(operator_id.to_owned()); + let input_id = DataId::from(input_id.to_owned()); + let Some(operator_channel) = operator_channels.get(&operator_id) else { + tracing::warn!("received input {id} for unknown operator"); + continue; + }; + + operator_channel.send(operator::IncomingEvent::Input { + input_id, + metadata, + data, + }); + } + Event::InputClosed(_) => todo!(), + Event::Error(_) => todo!(), } } @@ -173,24 +216,31 @@ async fn run( Ok(()) } -fn 
publisher( - self_id: &NodeId, - operator_id: OperatorId, - output_id: DataId, - communication: &mut dyn CommunicationLayer, -) -> eyre::Result> { - let topic = format!("{self_id}/{operator_id}/{output_id}"); - communication - .publisher(&topic) - .map_err(|err| eyre::eyre!(err)) - .wrap_err_with(|| format!("failed to create publisher for output {output_id}")) -} +// fn publisher( +// self_id: &NodeId, +// operator_id: OperatorId, +// output_id: DataId, +// ) -> eyre::Result> { +// let topic = format!("{self_id}/{operator_id}/{output_id}"); +// communication +// .publisher(&topic) +// .map_err(|err| eyre::eyre!(err)) +// .wrap_err_with(|| format!("failed to create publisher for output {output_id}")) +// } enum Event { Operator { id: OperatorId, event: OperatorEvent, }, + Stop, + Input { + id: dora_core::config::DataId, + metadata: dora_message::Metadata<'static>, + data: Option>, + }, + InputClosed(dora_core::config::DataId), + Error(String), } fn set_up_tracing() -> eyre::Result<()> { diff --git a/binaries/runtime/src/operator/mod.rs b/binaries/runtime/src/operator/mod.rs index 7429f64f..0d0a2b4a 100644 --- a/binaries/runtime/src/operator/mod.rs +++ b/binaries/runtime/src/operator/mod.rs @@ -1,52 +1,26 @@ use dora_core::{ - config::NodeId, + config::{DataId, NodeId}, descriptor::{OperatorDefinition, OperatorSource}, }; -use dora_node_api::communication::{self, CommunicationLayer}; +use dora_message::{Metadata, MetadataParameters}; use eyre::Context; #[cfg(feature = "tracing")] use opentelemetry::sdk::trace::Tracer; use std::any::Any; -use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::{Receiver, Sender}; #[cfg(not(feature = "tracing"))] type Tracer = (); mod python; -mod shared_lib; +// mod shared_lib; -#[tracing::instrument(skip(communication))] pub fn spawn_operator( node_id: &NodeId, operator_definition: OperatorDefinition, + incoming_events: Receiver, events_tx: Sender, - communication: &mut dyn CommunicationLayer, ) -> eyre::Result<()> { - let inputs = 
communication::subscribe_all(communication, &operator_definition.config.inputs) - .wrap_err_with(|| { - format!( - "failed to subscribe to inputs of operator {}", - operator_definition.id - ) - })?; - - let publishers = operator_definition - .config - .outputs - .iter() - .map(|output_id| { - let topic = format!( - "{node_id}/{operator_id}/{output_id}", - operator_id = operator_definition.id - ); - communication - .publisher(&topic) - .map_err(|err| eyre::eyre!(err)) - .wrap_err_with(|| format!("failed to create publisher for output {output_id}")) - .map(|p| (output_id.to_owned(), p)) - }) - .collect::>()?; - #[cfg(feature = "tracing")] let tracer = dora_tracing::init_tracing(format!("{node_id}/{}", operator_definition.id).as_str()) @@ -57,21 +31,22 @@ pub fn spawn_operator( match &operator_definition.config.source { OperatorSource::SharedLibrary(source) => { - shared_lib::spawn( - node_id, - &operator_definition.id, - source, - events_tx, - inputs, - publishers, - tracer, - ) - .wrap_err_with(|| { - format!( - "failed to spawn shared library operator for {}", - operator_definition.id - ) - })?; + // shared_lib::spawn( + // node_id, + // &operator_definition.id, + // source, + // events_tx, + // input_events, + // publishers, + // tracer, + // ) + // .wrap_err_with(|| { + // format!( + // "failed to spawn shared library operator for {}", + // operator_definition.id + // ) + // })?; + todo!() } OperatorSource::Python(source) => { python::spawn( @@ -79,8 +54,7 @@ pub fn spawn_operator( &operator_definition.id, source, events_tx, - inputs, - publishers, + incoming_events, tracer, ) .wrap_err_with(|| { @@ -98,9 +72,25 @@ pub fn spawn_operator( } pub enum OperatorEvent { + Output { + output_id: DataId, + metadata: MetadataParameters<'static>, + data: Vec, + }, Error(eyre::Error), Panic(Box), - Finished { reason: StopReason }, + Finished { + reason: StopReason, + }, +} + +pub enum IncomingEvent { + Stop, + Input { + input_id: DataId, + metadata: Metadata<'static>, + 
data: Option>, + }, } #[derive(Debug)] diff --git a/binaries/runtime/src/operator/python.rs b/binaries/runtime/src/operator/python.rs index b4849cca..2a38879b 100644 --- a/binaries/runtime/src/operator/python.rs +++ b/binaries/runtime/src/operator/python.rs @@ -1,13 +1,11 @@ #![allow(clippy::borrow_deref_ref)] // clippy warns about code generated by #[pymethods] -use super::{OperatorEvent, StopReason, Tracer}; +use super::{IncomingEvent, OperatorEvent, StopReason, Tracer}; use dora_core::{ - config::{DataId, NodeId, OperatorId}, + config::{NodeId, OperatorId}, descriptor::source_is_url, }; use dora_download::download_file; -use dora_message::uhlc; -use dora_node_api::communication::Publisher; use dora_operator_api_python::metadata_to_pydict; use dora_operator_api_types::DoraStatus; use eyre::{bail, eyre, Context}; @@ -19,12 +17,10 @@ use pyo3::{ }; use std::{ borrow::Cow, - collections::HashMap, panic::{catch_unwind, AssertUnwindSafe}, path::Path, - sync::Arc, }; -use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::{Receiver, Sender}; fn traceback(err: pyo3::PyErr) -> eyre::Report { Python::with_gil(|py| { @@ -43,8 +39,7 @@ pub fn spawn( operator_id: &OperatorId, source: &str, events_tx: Sender, - inputs: flume::Receiver, - publishers: HashMap>, + mut incoming_events: Receiver, tracer: Tracer, ) -> eyre::Result<()> { let path = if source_is_url(source) { @@ -71,8 +66,7 @@ pub fn spawn( let path_cloned = path.clone(); let send_output = SendOutputCallback { - publishers: Arc::new(publishers), - hlc: Arc::new(uhlc::HLC::default()), + events_tx: events_tx.clone(), }; let init_operator = move |py: Python| { @@ -114,49 +108,60 @@ pub fn spawn( Python::with_gil(init_operator).wrap_err("failed to init python operator")?; let reason = loop { - let Ok(mut input) = inputs.recv() else {break StopReason::InputsClosed }; - - #[cfg(feature = "tracing")] - let (_child_cx, string_cx) = { - use dora_tracing::{deserialize_context, serialize_context}; - use 
opentelemetry::trace::TraceContextExt; - use opentelemetry::{trace::Tracer, Context as OtelContext}; - - let cx = deserialize_context(&input.metadata.parameters.open_telemetry_context); - let span = tracer.start_with_context(format!("{}", input.id), &cx); - - let child_cx = OtelContext::current_with_span(span); - let string_cx = serialize_context(&child_cx); - (child_cx, string_cx) - }; - - #[cfg(not(feature = "tracing"))] - let string_cx = { - let () = tracer; - "".to_string() - }; - input.metadata.parameters.open_telemetry_context = Cow::Owned(string_cx); - - let status_enum = Python::with_gil(|py| { - let input_dict = PyDict::new(py); - - input_dict.set_item("id", input.id.as_str())?; - input_dict.set_item("data", PyBytes::new(py, &input.data()))?; - input_dict.set_item("metadata", metadata_to_pydict(input.metadata(), py))?; - - operator - .call_method1(py, "on_input", (input_dict, send_output.clone())) - .map_err(traceback) - })?; - let status_val = Python::with_gil(|py| status_enum.getattr(py, "value")) - .wrap_err("on_input must have enum return value")?; - let status: i32 = Python::with_gil(|py| status_val.extract(py)) - .wrap_err("on_input has invalid return value")?; - match status { - s if s == DoraStatus::Continue as i32 => {} // ok - s if s == DoraStatus::Stop as i32 => break StopReason::ExplicitStop, - s if s == DoraStatus::StopAll as i32 => break StopReason::ExplicitStopAll, - other => bail!("on_input returned invalid status {other}"), + let Some(event) = incoming_events.blocking_recv() else { break StopReason::InputsClosed }; + + match event { + IncomingEvent::Input { + input_id, + mut metadata, + data, + } => { + #[cfg(feature = "tracing")] + let (_child_cx, string_cx) = { + use dora_tracing::{deserialize_context, serialize_context}; + use opentelemetry::trace::TraceContextExt; + use opentelemetry::{trace::Tracer, Context as OtelContext}; + + let cx = deserialize_context(&metadata.parameters.open_telemetry_context); + let span = 
tracer.start_with_context(format!("{}", input_id), &cx); + + let child_cx = OtelContext::current_with_span(span); + let string_cx = serialize_context(&child_cx); + (child_cx, string_cx) + }; + + #[cfg(not(feature = "tracing"))] + let string_cx = { + let () = tracer; + "".to_string() + }; + metadata.parameters.open_telemetry_context = Cow::Owned(string_cx); + + let status_enum = Python::with_gil(|py| { + let input_dict = PyDict::new(py); + + input_dict.set_item("id", input_id.as_str())?; + if let Some(data) = data { + input_dict.set_item("data", PyBytes::new(py, &data))?; + } + input_dict.set_item("metadata", metadata_to_pydict(&metadata, py))?; + + operator + .call_method1(py, "on_input", (input_dict, send_output.clone())) + .map_err(traceback) + })?; + let status_val = Python::with_gil(|py| status_enum.getattr(py, "value")) + .wrap_err("on_input must have enum return value")?; + let status: i32 = Python::with_gil(|py| status_val.extract(py)) + .wrap_err("on_input has invalid return value")?; + match status { + s if s == DoraStatus::Continue as i32 => {} // ok + s if s == DoraStatus::Stop as i32 => break StopReason::ExplicitStop, + s if s == DoraStatus::StopAll as i32 => break StopReason::ExplicitStopAll, + other => bail!("on_input returned invalid status {other}"), + } + } + IncomingEvent::Stop => {} } }; @@ -192,15 +197,15 @@ pub fn spawn( #[pyclass] #[derive(Clone)] struct SendOutputCallback { - publishers: Arc>>, - hlc: Arc, + events_tx: Sender, } #[allow(unsafe_op_in_unsafe_fn)] mod callback_impl { + use crate::operator::OperatorEvent; + use super::SendOutputCallback; - use dora_message::Metadata; use dora_operator_api_python::pydict_to_metadata; use eyre::{eyre, Context, Result}; use pyo3::{ @@ -217,25 +222,21 @@ mod callback_impl { metadata: Option<&PyDict>, ) -> Result<()> { let data = data.as_bytes(); - let parameters = pydict_to_metadata(metadata).wrap_err("Could not parse metadata.")?; - let metadata = Metadata::from_parameters(self.hlc.new_timestamp(), 
parameters); - let mut message = metadata - .serialize() - .context(format!("failed to serialize `{}` metadata", output))?; - - match self.publishers.get(output) { - Some(publisher) => { - message.extend_from_slice(data); - - publisher - .publish(&message) - .map_err(|err| eyre::eyre!(err)) - .context("publish failed") - } - None => Err(eyre!( - "unexpected output {output} (not defined in dataflow config)" - )), - } + let metadata = pydict_to_metadata(metadata) + .wrap_err("Could not parse metadata.")? + .into_owned(); + + let event = OperatorEvent::Output { + output_id: output.to_owned().into(), + metadata, + data: data.to_owned(), + }; + + self.events_tx + .blocking_send(event) + .map_err(|_| eyre!("failed to send output to runtime"))?; + + Ok(()) } } } diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 25ab4f17..f754b65c 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -2,7 +2,7 @@ use std::{collections::BTreeMap, path::PathBuf}; use crate::{ config::{DataId, NodeId, NodeRunConfig}, - descriptor, + descriptor::{self, OperatorDefinition}, }; use dora_message::Metadata; use uuid::Uuid; @@ -16,6 +16,12 @@ pub struct NodeConfig { pub daemon_events_region_id: SharedMemoryId, } +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct RuntimeConfig { + pub node: NodeConfig, + pub operators: Vec, +} + #[derive(Debug, serde::Serialize, serde::Deserialize)] pub enum DaemonRequest { Register { From d6470e75a3a82b37bb0a76149521d1a3727181fb Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 30 Jan 2023 11:14:27 +0100 Subject: [PATCH 115/225] Don't use blocking output send function in runtime loop --- binaries/runtime/src/lib.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index ef7a0a87..ede85aeb 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs 
@@ -176,9 +176,15 @@ async fn run( data, } => { let output_id = DataId::from(format!("{operator_id}/{output_id}")); - node.send_output(output_id, metadata, data.len(), |buf| { - buf.copy_from_slice(&data); - }); + let result; + (node, result) = tokio::task::spawn_blocking(move || { + let result = node.send_output(output_id, metadata, data.len(), |buf| { + buf.copy_from_slice(&data); + }); + (node, result) + }) + .await?; + result.wrap_err("failed to send node output")?; } } } From 6eb9b46637fa4db7e5af416ad14c5561bd3e1996 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 30 Jan 2023 11:30:29 +0100 Subject: [PATCH 116/225] Close outputs of operators when they stop We cannot wait until all operators of the runtime node are finished because operators might be subscribed to other operators. --- apis/rust/node/src/daemon.rs | 14 +++++ apis/rust/node/src/lib.rs | 14 +++++ binaries/daemon/src/lib.rs | 76 ++++++++++++++++++--------- binaries/daemon/src/listener.rs | 3 ++ binaries/runtime/src/lib.rs | 49 +++++++++-------- libraries/core/src/daemon_messages.rs | 1 + 6 files changed, 110 insertions(+), 47 deletions(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index 13a51ead..ed137a41 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -73,6 +73,20 @@ impl ControlChannel { Ok(()) } + pub fn report_closed_outputs(&mut self, outputs: Vec) -> eyre::Result<()> { + let reply = self + .channel + .request(&DaemonRequest::CloseOutputs(outputs)) + .wrap_err("failed to report closed outputs to dora-daemon")?; + match reply { + dora_core::daemon_messages::DaemonReply::Result(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("failed to receive closed outputs reply from dora-daemon")?, + other => bail!("unexpected closed outputs reply: {other:?}"), + } + Ok(()) + } + pub fn prepare_message( &mut self, output_id: DataId, diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 92b11432..d21ed053 100644 
--- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -107,6 +107,20 @@ impl DoraNode { Ok(()) } + pub fn close_outputs(&mut self, outputs: Vec) -> eyre::Result<()> { + for output_id in &outputs { + if !self.node_config.outputs.remove(output_id) { + eyre::bail!("unknown output {output_id}"); + } + } + + self.control_channel + .report_closed_outputs(outputs) + .wrap_err("failed to report closed outputs to daemon")?; + + Ok(()) + } + pub fn id(&self) -> &NodeId { &self.id } diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index e0fb514c..cd3e39d5 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -360,6 +360,24 @@ impl Daemon { }; let _ = reply_sender.send(DaemonReply::Result(result)); } + DaemonNodeEvent::CloseOutputs(outputs) => { + // notify downstream nodes + let inner = async { + let dataflow = self + .running + .get_mut(&dataflow_id) + .wrap_err_with(|| format!("failed to get downstream nodes: no running dataflow with ID `{dataflow_id}`"))?; + send_input_closed_events(dataflow, |(source_id, output_id)| { + source_id == &node_id && outputs.contains(output_id) + }) + .await; + Result::<_, eyre::Error>::Ok(()) + }; + + let reply = inner.await.map_err(|err| format!("{err:?}")); + let _ = reply_sender.send(DaemonReply::Result(reply)); + // TODO: notify remote nodes + } DaemonNodeEvent::Stopped => { tracing::info!("Stopped: {dataflow_id}/{node_id}"); @@ -370,31 +388,7 @@ impl Daemon { .running .get_mut(&dataflow_id) .wrap_err_with(|| format!("failed to get downstream nodes: no running dataflow with ID `{dataflow_id}`"))?; - let downstream_nodes: BTreeSet<_> = dataflow - .mappings - .iter() - .filter(|((source_id, _), _)| source_id == &node_id) - .flat_map(|(_, v)| v) - .collect(); - for (receiver_id, input_id) in downstream_nodes { - let Some(channel) = dataflow.subscribe_channels.get(receiver_id) else { - continue; - }; - - let _ = channel - .send_async(daemon_messages::NodeEvent::InputClosed { - id: 
input_id.clone(), - }) - .await; - - if let Some(open_inputs) = dataflow.open_inputs.get_mut(receiver_id) { - open_inputs.remove(input_id); - if open_inputs.is_empty() { - // close the subscriber channel - dataflow.subscribe_channels.remove(receiver_id); - } - } - } + send_input_closed_events(dataflow, |(source_id, _)| source_id == &node_id).await; // TODO: notify remote nodes @@ -602,6 +596,37 @@ impl Daemon { } } +async fn send_input_closed_events(dataflow: &mut RunningDataflow, mut filter: F) +where + F: FnMut(&(NodeId, DataId)) -> bool, +{ + let downstream_nodes: BTreeSet<_> = dataflow + .mappings + .iter() + .filter(|(k, _)| filter(k)) + .flat_map(|(_, v)| v) + .collect(); + for (receiver_id, input_id) in downstream_nodes { + let Some(channel) = dataflow.subscribe_channels.get(receiver_id) else { + continue; + }; + + let _ = channel + .send_async(daemon_messages::NodeEvent::InputClosed { + id: input_id.clone(), + }) + .await; + + if let Some(open_inputs) = dataflow.open_inputs.get_mut(receiver_id) { + open_inputs.remove(input_id); + if open_inputs.is_empty() { + // close the subscriber channel + dataflow.subscribe_channels.remove(receiver_id); + } + } + } +} + #[derive(Default)] pub struct RunningDataflow { subscribe_channels: HashMap>, @@ -647,6 +672,7 @@ pub enum DaemonNodeEvent { Subscribe { event_sender: flume::Sender, }, + CloseOutputs(Vec), } pub enum ShmemHandlerEvent { diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 93c99f93..9ddf605c 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -165,6 +165,9 @@ impl Listener { self.send_reply(&reply)?; } DaemonRequest::Stopped => self.process_daemon_event(DaemonNodeEvent::Stopped)?, + DaemonRequest::CloseOutputs(outputs) => { + self.process_daemon_event(DaemonNodeEvent::CloseOutputs(outputs))? 
+ } DaemonRequest::PrepareOutputMessage { output_id, metadata, diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index ede85aeb..936e1291 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -3,6 +3,7 @@ use dora_core::{ config::{DataId, OperatorId}, daemon_messages::RuntimeConfig, + descriptor::OperatorConfig, }; use dora_node_api::DoraNode; use eyre::{bail, Context, Result}; @@ -74,7 +75,6 @@ pub fn main() -> eyre::Result<()> { event.map(|event| (event, stream)) }); let events = (operator_events, daemon_events).merge(); - let node_id_clone = node_id.clone(); let tokio_runtime = Builder::new_current_thread() .enable_all() .build() @@ -96,8 +96,9 @@ pub fn main() -> eyre::Result<()> { operator_channels.insert(operator_config.id.clone(), operator_tx); } + let operator_config = operators.into_iter().map(|c| (c.id, c.config)).collect(); let main_task = std::thread::spawn(move || -> Result<()> { - tokio_runtime.block_on(run(node, events, operator_channels)) + tokio_runtime.block_on(run(node, operator_config, events, operator_channels)) }); main_task @@ -109,6 +110,7 @@ pub fn main() -> eyre::Result<()> { async fn run( mut node: DoraNode, + operators: HashMap, mut events: impl Stream + Unpin, mut operator_channels: HashMap>, ) -> eyre::Result<()> { @@ -144,38 +146,37 @@ async fn run( let data = metadata .serialize() .wrap_err("failed to serialize stop message")?; + todo!("instruct dora-daemon/dora-coordinator to stop other nodes"); // manual_stop_publisher // .publish(&data) // .map_err(|err| eyre::eyre!(err)) // .wrap_err("failed to send stop message")?; break; } - // if let Some(stop_publisher) = operator_stop_publishers.remove(&id) { - // tracing::info!("operator {node_id}/{id} finished ({reason:?})"); - // stopped_operators.insert(id.clone()); - // // send stopped message - // tokio::task::spawn_blocking(move || stop_publisher.publish(&[])) - // .await - // .wrap_err("failed to join stop publish task")? 
- // .map_err(|err| eyre::eyre!(err)) - // .with_context(|| { - // format!( - // "failed to send stop message for operator `{node_id}/{id}`" - // ) - // })?; - // if operator_stop_publishers.is_empty() { - // break; - // } - // } else { - // tracing::warn!("no stop publisher for {id}"); - // } + + let Some(config) = operators.get(&operator_id) else { + tracing::warn!("received Finished event for unknown operator `{operator_id}`"); + continue; + }; + let outputs = config + .outputs + .iter() + .map(|output_id| operator_output_id(&operator_id, output_id)) + .collect(); + let result; + (node, result) = tokio::task::spawn_blocking(move || { + let result = node.close_outputs(outputs); + (node, result) + }) + .await?; + result.wrap_err("failed to close outputs of finished operator")?; } OperatorEvent::Output { output_id, metadata, data, } => { - let output_id = DataId::from(format!("{operator_id}/{output_id}")); + let output_id = operator_output_id(&operator_id, &output_id); let result; (node, result) = tokio::task::spawn_blocking(move || { let result = node.send_output(output_id, metadata, data.len(), |buf| { @@ -222,6 +223,10 @@ async fn run( Ok(()) } +fn operator_output_id(operator_id: &OperatorId, output_id: &DataId) -> DataId { + DataId::from(format!("{operator_id}/{output_id}")) +} + // fn publisher( // self_id: &NodeId, // operator_id: OperatorId, diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index f754b65c..a1789745 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -41,6 +41,7 @@ pub enum DaemonRequest { output_id: DataId, metadata: Metadata<'static>, }, + CloseOutputs(Vec), Stopped, NextEvent { drop_tokens: Vec, From 00b863fdc96adcc73bd00f67d6c18fb6bb4e7f55 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 30 Jan 2023 11:33:27 +0100 Subject: [PATCH 117/225] Fix some warnings and remove some unused code --- binaries/runtime/src/lib.rs | 19 ++----------------- 1 
file changed, 2 insertions(+), 17 deletions(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 936e1291..749f51cb 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -11,10 +11,7 @@ use futures::{Stream, StreamExt}; use futures_concurrency::Merge; use operator::{spawn_operator, OperatorEvent, StopReason}; -use std::{ - collections::{BTreeSet, HashMap}, - mem, -}; +use std::{collections::HashMap, mem}; use tokio::{runtime::Builder, sync::mpsc}; use tokio_stream::{wrappers::ReceiverStream, StreamMap}; @@ -207,7 +204,7 @@ async fn run( continue; }; - operator_channel.send(operator::IncomingEvent::Input { + let _ = operator_channel.send(operator::IncomingEvent::Input { input_id, metadata, data, @@ -227,18 +224,6 @@ fn operator_output_id(operator_id: &OperatorId, output_id: &DataId) -> DataId { DataId::from(format!("{operator_id}/{output_id}")) } -// fn publisher( -// self_id: &NodeId, -// operator_id: OperatorId, -// output_id: DataId, -// ) -> eyre::Result> { -// let topic = format!("{self_id}/{operator_id}/{output_id}"); -// communication -// .publisher(&topic) -// .map_err(|err| eyre::eyre!(err)) -// .wrap_err_with(|| format!("failed to create publisher for output {output_id}")) -// } - enum Event { Operator { id: OperatorId, From 4bd235afd7b846e1486041d2b83e3bdedb2eb031 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 30 Jan 2023 11:33:46 +0100 Subject: [PATCH 118/225] Close event channels after sending Stop --- binaries/runtime/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 749f51cb..20d5af2e 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -187,9 +187,9 @@ async fn run( } } Event::Stop => { - // forward stop event to all operators - for channel in operator_channels.values_mut() { - channel.send(operator::IncomingEvent::Stop).await; + // forward stop event to all operators 
and close the event channels + for (_, channel) in operator_channels.drain() { + let _ = channel.send(operator::IncomingEvent::Stop).await; } } Event::Input { id, metadata, data } => { From 27d74a7a71f1624a0a8e4fb3b1c52a442f6ee01a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 30 Jan 2023 11:34:01 +0100 Subject: [PATCH 119/225] Resolve some todos --- binaries/runtime/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 20d5af2e..cf2bf84b 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -210,8 +210,8 @@ async fn run( data, }); } - Event::InputClosed(_) => todo!(), - Event::Error(_) => todo!(), + Event::InputClosed(_) => {} + Event::Error(err) => eyre::bail!("received error event: {err}"), } } From 8cbc891251123cc6469af4534047b4c825be8ace Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 30 Jan 2023 17:21:01 +0100 Subject: [PATCH 120/225] Add start runtime function to python node api again --- Cargo.lock | 1 + Cargo.toml | 4 ++++ apis/python/node/Cargo.toml | 2 +- apis/python/node/src/lib.rs | 16 ++++++++-------- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 35a8f903..6f8d0ca3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1111,6 +1111,7 @@ version = "0.1.2" dependencies = [ "dora-node-api", "dora-operator-api-python", + "dora-runtime", "eyre", "flume", "pyo3", diff --git a/Cargo.toml b/Cargo.toml index b0ca1b3c..30148c22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,6 +68,10 @@ path = "examples/c++-dataflow/run.rs" name = "python-dataflow" path = "examples/python-dataflow/run.rs" +[[example]] +name = "python-operator-dataflow" +path = "examples/python-operator-dataflow/run.rs" + [[example]] name = "benchmark" path = "examples/benchmark/run.rs" diff --git a/apis/python/node/Cargo.toml b/apis/python/node/Cargo.toml index 993d206d..fdffc278 100644 --- a/apis/python/node/Cargo.toml 
+++ b/apis/python/node/Cargo.toml @@ -13,7 +13,7 @@ pyo3 = { version = "0.16", features = ["eyre", "abi3-py37"] } eyre = "0.6" serde_yaml = "0.8.23" flume = "0.10.14" -# dora-runtime = { path = "../../../binaries/runtime" } +dora-runtime = { path = "../../../binaries/runtime" } [lib] name = "dora" diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index 8e2be28c..e083c73d 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -105,17 +105,17 @@ impl Node { } } -// #[pyfunction] -// fn start_runtime() -> Result<()> { -// dora_runtime::main() -// .wrap_err("Python Dora Runtime failed.") -// .unwrap(); -// Ok(()) -// } +#[pyfunction] +fn start_runtime() -> Result<()> { + dora_runtime::main() + .wrap_err("Python Dora Runtime failed.") + .unwrap(); + Ok(()) +} #[pymodule] fn dora(_py: Python, m: &PyModule) -> PyResult<()> { - // m.add_function(wrap_pyfunction!(start_runtime, m)?)?; + m.add_function(wrap_pyfunction!(start_runtime, m)?)?; m.add_class::().unwrap(); Ok(()) } From 44b1f770359b872c249ab80c511a82acf95bbd63 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 30 Jan 2023 17:23:12 +0100 Subject: [PATCH 121/225] Remove first-class operator support from input mappings Treat them like normal input IDs with a slash in their name in most cases. 
--- libraries/core/src/config.rs | 48 ++++++++-------------- libraries/core/src/descriptor/mod.rs | 10 ----- libraries/core/src/descriptor/visualize.rs | 18 ++++---- 3 files changed, 23 insertions(+), 53 deletions(-) diff --git a/libraries/core/src/config.rs b/libraries/core/src/config.rs index 2a880802..e1ecadbe 100644 --- a/libraries/core/src/config.rs +++ b/libraries/core/src/config.rs @@ -55,6 +55,12 @@ impl std::fmt::Display for OperatorId { } } +impl AsRef for OperatorId { + fn as_ref(&self) -> &str { + &self.0 + } +} + #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] pub struct DataId(String); @@ -123,13 +129,6 @@ impl InputMapping { InputMapping::Timer { .. } => DORA_NODE_ID.get_or_init(|| NodeId("dora".to_string())), } } - - pub fn operator(&self) -> &Option { - match self { - InputMapping::User(mapping) => &mapping.operator, - InputMapping::Timer { .. } => &None, - } - } } impl fmt::Display for InputMapping { @@ -140,11 +139,7 @@ impl fmt::Display for InputMapping { write!(f, "dora/timer/{duration}") } InputMapping::User(mapping) => { - if let Some(operator) = &mapping.operator { - write!(f, "{}/{operator}/{}", mapping.source, mapping.output) - } else { - write!(f, "{}/{}", mapping.source, mapping.output) - } + write!(f, "{}/{}", mapping.source, mapping.output) } } } @@ -165,18 +160,13 @@ impl<'de> Deserialize<'de> for InputMapping { D: serde::Deserializer<'de>, { let string = String::deserialize(deserializer)?; - let (source, rest) = string + let (source, output) = string .split_once('/') .ok_or_else(|| serde::de::Error::custom("input must start with `/`"))?; - let (operator, output) = rest - .split_once('/') - .map(|(op, out)| (Some(op), out)) - .unwrap_or((None, rest)); - - let deserialized = match source { - "dora" => match operator { - Some("timer") => { + let deserialized = if let Some(dora_output) = source.strip_prefix("dora/") { + match dora_output { + "timer" => { let (unit, value) = 
output.split_once('/').ok_or_else(|| { serde::de::Error::custom( "timer input must specify unit and value (e.g. `secs/5` or `millis/100`)", @@ -207,22 +197,17 @@ impl<'de> Deserialize<'de> for InputMapping { }; Self::Timer { interval } } - Some(other) => { + other => { return Err(serde::de::Error::custom(format!( "unknown dora input `{other}`" ))) } - None => { - return Err(serde::de::Error::custom( - "dora input has invalid format".to_string(), - )) - } - }, - _ => Self::User(UserInputMapping { + } + } else { + Self::User(UserInputMapping { source: source.to_owned().into(), - operator: operator.map(|o| o.to_owned().into()), output: output.to_owned().into(), - }), + }) }; Ok(deserialized) @@ -232,7 +217,6 @@ impl<'de> Deserialize<'de> for InputMapping { #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct UserInputMapping { pub source: NodeId, - pub operator: Option, pub output: DataId, } diff --git a/libraries/core/src/descriptor/mod.rs b/libraries/core/src/descriptor/mod.rs index 53f614a9..558c215b 100644 --- a/libraries/core/src/descriptor/mod.rs +++ b/libraries/core/src/descriptor/mod.rs @@ -46,16 +46,6 @@ impl Descriptor { NodeKind::Custom(node) => node.run_config.inputs.values_mut().collect(), NodeKind::Operator(operator) => operator.config.inputs.values_mut().collect(), }; - for mapping in input_mappings.into_iter().filter_map(|m| match m { - InputMapping::Timer { .. 
} => None, - InputMapping::User(m) => Some(m), - }) { - if let Some(op_name) = single_operator_nodes.get(&mapping.source).copied() { - if mapping.operator.is_none() { - mapping.operator = Some(op_name.to_owned()); - } - } - } // resolve nodes let kind = match node.kind { diff --git a/libraries/core/src/descriptor/visualize.rs b/libraries/core/src/descriptor/visualize.rs index fdcd1ebb..ce595363 100644 --- a/libraries/core/src/descriptor/visualize.rs +++ b/libraries/core/src/descriptor/visualize.rs @@ -162,15 +162,11 @@ fn visualize_user_mapping( input_id: &DataId, flowchart: &mut String, ) { - let UserInputMapping { - source, - operator, - output, - } = mapping; + let UserInputMapping { source, output } = mapping; let mut source_found = false; if let Some(source_node) = nodes.get(source) { - match (&source_node.kind, operator) { - (CoreNodeKind::Custom(custom_node), None) => { + match &source_node.kind { + CoreNodeKind::Custom(custom_node) => { if custom_node.run_config.outputs.contains(output) { let data = if output == input_id { format!("{output}") @@ -181,10 +177,11 @@ fn visualize_user_mapping( source_found = true; } } - (CoreNodeKind::Runtime(RuntimeNode { operators, .. }), Some(operator_id)) => { - if let Some(operator) = operators.iter().find(|o| &o.id == operator_id) { + CoreNodeKind::Runtime(RuntimeNode { operators, .. 
}) => { + let (operator_id, output) = output.split_once('/').unwrap_or(("", output)); + if let Some(operator) = operators.iter().find(|o| o.id.as_ref() == operator_id) { if operator.config.outputs.contains(output) { - let data = if output == input_id { + let data = if output == input_id.as_str() { format!("{output}") } else { format!("{output} as {input_id}") @@ -195,7 +192,6 @@ fn visualize_user_mapping( } } } - (CoreNodeKind::Custom(_), Some(_)) | (CoreNodeKind::Runtime(_), None) => {} } } if !source_found { From cac527a3821ab4c80efb884d55b8429921e8e47d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 30 Jan 2023 17:26:19 +0100 Subject: [PATCH 122/225] Add support for spawning runtime nodes with operators --- binaries/coordinator/src/run/mod.rs | 24 +---- binaries/daemon/src/lib.rs | 96 +++++++++++------ binaries/daemon/src/spawn.rs | 146 ++++++++++++++++++-------- binaries/runtime/src/lib.rs | 4 +- libraries/core/src/daemon_messages.rs | 10 +- 5 files changed, 171 insertions(+), 109 deletions(-) diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index 894ca2f3..f3da0ecf 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -2,9 +2,7 @@ use crate::tcp_utils::{tcp_receive, tcp_send}; use dora_core::{ config::{CommunicationConfig, NodeId}, - daemon_messages::{ - DaemonCoordinatorEvent, DaemonCoordinatorReply, SpawnDataflowNodes, SpawnNodeParams, - }, + daemon_messages::{DaemonCoordinatorEvent, DaemonCoordinatorReply, SpawnDataflowNodes}, descriptor::{CoreNodeKind, Descriptor}, }; use eyre::{bail, eyre, ContextCompat, WrapErr}; @@ -63,26 +61,10 @@ pub async fn spawn_dataflow( } } - let mut custom_nodes = BTreeMap::new(); - for node in nodes { - match node.kind { - CoreNodeKind::Runtime(_) => todo!(), - CoreNodeKind::Custom(n) => { - custom_nodes.insert( - node.id.clone(), - SpawnNodeParams { - node_id: node.id, - node: n, - working_dir: working_dir.clone(), - }, - ); - } - } - } - 
let spawn_command = SpawnDataflowNodes { dataflow_id: uuid, - nodes: custom_nodes, + working_dir, + nodes, }; let message = serde_json::to_vec(&DaemonCoordinatorEvent::Spawn(spawn_command))?; diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index cd3e39d5..249748b4 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -4,9 +4,9 @@ use dora_core::{ coordinator_messages::DaemonEvent, daemon_messages::{ self, DaemonCoordinatorEvent, DaemonCoordinatorReply, DaemonReply, DataflowId, DropToken, - SpawnDataflowNodes, SpawnNodeParams, + SpawnDataflowNodes, }, - descriptor::{CoreNodeKind, Descriptor}, + descriptor::{CoreNodeKind, Descriptor, ResolvedNode}, }; use dora_message::uhlc::HLC; use eyre::{bail, eyre, Context, ContextCompat}; @@ -17,7 +17,7 @@ use std::{ collections::{BTreeMap, BTreeSet, HashMap}, fmt, net::SocketAddr, - path::Path, + path::{Path, PathBuf}, time::{Duration, Instant}, }; use tcp_utils::tcp_receive; @@ -69,32 +69,17 @@ impl Daemon { .to_owned(); let nodes = read_descriptor(dataflow_path).await?.resolve_aliases(); - let mut custom_nodes = BTreeMap::new(); - for node in nodes { - match node.kind { - CoreNodeKind::Runtime(_) => todo!(), - CoreNodeKind::Custom(n) => { - custom_nodes.insert( - node.id.clone(), - SpawnNodeParams { - node_id: node.id, - node: n, - working_dir: working_dir.clone(), - }, - ); - } - } - } let spawn_command = SpawnDataflowNodes { dataflow_id: Uuid::new_v4(), - nodes: custom_nodes, + working_dir, + nodes, }; let exit_when_done = spawn_command .nodes .iter() - .map(|(id, _)| (spawn_command.dataflow_id, id.clone())) + .map(|n| (spawn_command.dataflow_id, n.id.clone())) .collect(); let (reply_tx, reply_rx) = oneshot::channel(); let coordinator_events = stream::once(async move { @@ -229,8 +214,15 @@ impl Daemon { event: DaemonCoordinatorEvent, ) -> (DaemonCoordinatorReply, RunStatus) { match event { - DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { dataflow_id, nodes }) => { - let result 
= self.spawn_dataflow(dataflow_id, nodes).await; + DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { + dataflow_id, + working_dir, + nodes, + }) => { + let result = self.spawn_dataflow(dataflow_id, working_dir, nodes).await; + if let Err(err) = &result { + tracing::error!("{err:?}"); + } let reply = DaemonCoordinatorReply::SpawnResult(result.map_err(|err| format!("{err:?}"))); (reply, RunStatus::Continue) @@ -266,7 +258,8 @@ impl Daemon { async fn spawn_dataflow( &mut self, dataflow_id: uuid::Uuid, - nodes: BTreeMap, + working_dir: PathBuf, + nodes: Vec, ) -> eyre::Result<()> { let dataflow = match self.running.entry(dataflow_id) { std::collections::hash_map::Entry::Vacant(entry) => entry.insert(Default::default()), @@ -274,38 +267,39 @@ impl Daemon { bail!("there is already a running dataflow with ID `{dataflow_id}`") } }; - for (node_id, params) in nodes { - dataflow.running_nodes.insert(node_id.clone()); - for (input_id, mapping) in params.node.run_config.inputs.clone() { + for node in nodes { + dataflow.running_nodes.insert(node.id.clone()); + let inputs = node_inputs(&node); + + for (input_id, mapping) in inputs { dataflow .open_inputs - .entry(node_id.clone()) + .entry(node.id.clone()) .or_default() .insert(input_id.clone()); match mapping { InputMapping::User(mapping) => { - if mapping.operator.is_some() { - bail!("operators are not supported"); - } dataflow .mappings .entry((mapping.source, mapping.output)) .or_default() - .insert((node_id.clone(), input_id)); + .insert((node.id.clone(), input_id)); } InputMapping::Timer { interval } => { dataflow .timers .entry(interval) .or_default() - .insert((node_id.clone(), input_id)); + .insert((node.id.clone(), input_id)); } } } + let node_id = node.id.clone(); spawn::spawn_node( dataflow_id, - params, + &working_dir, + node, self.events_tx.clone(), self.shared_memory_handler_node.clone(), ) @@ -596,6 +590,40 @@ impl Daemon { } } +fn node_inputs(node: &ResolvedNode) -> BTreeMap { + match &node.kind { + 
CoreNodeKind::Custom(n) => n.run_config.inputs.clone(), + CoreNodeKind::Runtime(n) => runtime_node_inputs(n), + } +} + +fn runtime_node_inputs(n: &dora_core::descriptor::RuntimeNode) -> BTreeMap { + n.operators + .iter() + .flat_map(|operator| { + operator.config.inputs.iter().map(|(input_id, mapping)| { + ( + DataId::from(format!("{}/{input_id}", operator.id)), + mapping.clone(), + ) + }) + }) + .collect() +} + +fn runtime_node_outputs(n: &dora_core::descriptor::RuntimeNode) -> BTreeSet { + n.operators + .iter() + .flat_map(|operator| { + operator + .config + .outputs + .iter() + .map(|output_id| DataId::from(format!("{}/{output_id}", operator.id))) + }) + .collect() +} + async fn send_input_closed_events(dataflow: &mut RunningDataflow, mut filter: F) where F: FnMut(&(NodeId, DataId)) -> bool, diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index bda1f97e..ee51a46f 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,7 +1,11 @@ -use crate::{listener::listener_loop, shared_mem_handler, DoraEvent, Event}; +use crate::{ + listener::listener_loop, runtime_node_inputs, runtime_node_outputs, shared_mem_handler, + DoraEvent, Event, +}; use dora_core::{ - daemon_messages::{DataflowId, NodeConfig, SpawnNodeParams}, - descriptor::{resolve_path, source_is_url}, + config::NodeRunConfig, + daemon_messages::{DataflowId, NodeConfig, RuntimeConfig}, + descriptor::{resolve_path, source_is_url, OperatorSource, ResolvedNode}, }; use dora_download::download_file; use eyre::{eyre, WrapErr}; @@ -9,35 +13,34 @@ use shared_memory_server::{ShmemConf, ShmemServer}; use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; use tokio::sync::mpsc; -#[tracing::instrument] pub async fn spawn_node( dataflow_id: DataflowId, - params: SpawnNodeParams, + working_dir: &Path, + node: ResolvedNode, daemon_tx: mpsc::Sender, shmem_handler_tx: flume::Sender, ) -> eyre::Result<()> { - let SpawnNodeParams { - node_id, - node, - working_dir, - 
} = params; - + let node_id = node.id.clone(); tracing::trace!("Spawning node `{dataflow_id}/{node_id}`"); - let resolved_path = if source_is_url(&node.source) { + let source = node_source(&node)?; + + let resolved_path = if source_is_url(&source) { // try to download the shared library let target_path = Path::new("build") .join(node_id.to_string()) .with_extension(EXE_EXTENSION); - download_file(&node.source, &target_path) + download_file(source, &target_path) .await .wrap_err("failed to download custom node")?; target_path.clone() } else { - resolve_path(&node.source, &working_dir) - .wrap_err_with(|| format!("failed to resolve node source `{}`", node.source))? + resolve_path(&source, &working_dir) + .wrap_err_with(|| format!("failed to resolve node source `{}`", source))? }; + tracing::info!("spawning {}", resolved_path.display()); + let daemon_control_region = ShmemConf::new() .size(4096) .create() @@ -46,14 +49,8 @@ pub async fn spawn_node( .size(4096) .create() .wrap_err("failed to allocate daemon_events_region")?; - let node_config = NodeConfig { - dataflow_id, - node_id: node_id.clone(), - run_config: node.run_config.clone(), - daemon_control_region_id: daemon_control_region.get_os_id().to_owned(), - daemon_events_region_id: daemon_events_region.get_os_id().to_owned(), - }; - + let daemon_control_region_id = daemon_control_region.get_os_id().to_owned(); + let daemon_events_region_id = daemon_events_region.get_os_id().to_owned(); { let server = unsafe { ShmemServer::new(daemon_control_region) } .wrap_err("failed to create control server")?; @@ -74,31 +71,66 @@ pub async fn spawn_node( } let mut command = tokio::process::Command::new(&resolved_path); - if let Some(args) = &node.args { - command.args(args.split_ascii_whitespace()); - } - command.env( - "DORA_NODE_CONFIG", - serde_yaml::to_string(&node_config).wrap_err("failed to serialize node config")?, - ); command.current_dir(working_dir); + command.stdin(Stdio::null()); - // Injecting the env variable 
defined in the `yaml` into - // the node runtime. - if let Some(envs) = node.envs { - for (key, value) in envs { - command.env(key, value.to_string()); + let mut child = match node.kind { + dora_core::descriptor::CoreNodeKind::Custom(n) => { + let node_config = NodeConfig { + dataflow_id, + node_id: node_id.clone(), + run_config: n.run_config.clone(), + daemon_control_region_id, + daemon_events_region_id, + }; + if let Some(args) = &n.args { + command.args(args.split_ascii_whitespace()); + } + command.env( + "DORA_NODE_CONFIG", + serde_yaml::to_string(&node_config).wrap_err("failed to serialize node config")?, + ); + // Injecting the env variable defined in the `yaml` into + // the node runtime. + if let Some(envs) = n.envs { + for (key, value) in envs { + command.env(key, value.to_string()); + } + } + command.spawn().wrap_err_with(move || { + format!( + "failed to run source path: `{}` with args `{}`", + resolved_path.display(), + n.args.as_deref().unwrap_or_default() + ) + })? } - } - command.stdin(Stdio::null()); + dora_core::descriptor::CoreNodeKind::Runtime(n) => { + let runtime_config = RuntimeConfig { + node: NodeConfig { + dataflow_id, + node_id: node_id.clone(), + run_config: NodeRunConfig { + inputs: runtime_node_inputs(&n), + outputs: runtime_node_outputs(&n), + }, + daemon_control_region_id, + daemon_events_region_id, + }, + operators: n.operators, + }; + command.env( + "DORA_RUNTIME_CONFIG", + serde_yaml::to_string(&runtime_config) + .wrap_err("failed to serialize runtime config")?, + ); + + command.spawn().wrap_err_with(move || { + format!("failed to run runtime at `{}`", resolved_path.display()) + })? 
+ } + }; - let mut child = command.spawn().wrap_err_with(move || { - format!( - "failed to run source path: `{}` with args `{}`", - resolved_path.display(), - node.args.as_deref().unwrap_or_default() - ) - })?; let node_id_cloned = node_id.clone(); let wait_task = async move { let status = child.wait().await.context("child process failed")?; @@ -121,3 +153,29 @@ pub async fn spawn_node( }); Ok(()) } + +fn node_source(node: &ResolvedNode) -> eyre::Result<&str> { + match &node.kind { + dora_core::descriptor::CoreNodeKind::Runtime(node) => { + let has_python_operator = node + .operators + .iter() + .any(|x| matches!(x.config.source, OperatorSource::Python { .. })); + + let has_other_operator = node + .operators + .iter() + .any(|x| !matches!(x.config.source, OperatorSource::Python { .. })); + + if has_python_operator && !has_other_operator { + // Use python to spawn runtime if there is a python operator + Ok("python3") + } else if !has_python_operator && has_other_operator { + Ok("dora-runtime") + } else { + eyre::bail!("Runtime can not mix Python Operator with other type of operator."); + } + } + dora_core::descriptor::CoreNodeKind::Custom(node) => Ok(&node.source), + } +} diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index cf2bf84b..9b61d11c 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -21,8 +21,8 @@ pub fn main() -> eyre::Result<()> { set_up_tracing().context("failed to set up tracing subscriber")?; let config: RuntimeConfig = { - let raw = std::env::var("DORA_NODE_CONFIG") - .wrap_err("env variable DORA_NODE_CONFIG must be set")?; + let raw = std::env::var("DORA_RUNTIME_CONFIG") + .wrap_err("env variable DORA_RUNTIME_CONFIG must be set")?; serde_yaml::from_str(&raw).context("failed to deserialize operator config")? 
}; let RuntimeConfig { diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index a1789745..1013f0ce 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -2,7 +2,7 @@ use std::{collections::BTreeMap, path::PathBuf}; use crate::{ config::{DataId, NodeId, NodeRunConfig}, - descriptor::{self, OperatorDefinition}, + descriptor::{self, OperatorDefinition, ResolvedNode}, }; use dora_message::Metadata; use uuid::Uuid; @@ -115,12 +115,6 @@ pub type DataflowId = Uuid; #[derive(Debug, serde::Deserialize, serde::Serialize)] pub struct SpawnDataflowNodes { pub dataflow_id: DataflowId, - pub nodes: BTreeMap, -} - -#[derive(Debug, serde::Deserialize, serde::Serialize)] -pub struct SpawnNodeParams { - pub node_id: NodeId, - pub node: descriptor::CustomNode, pub working_dir: PathBuf, + pub nodes: Vec, } From 6aaf4b845cb3ee68f6e0d566dffc02c416c0001b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 30 Jan 2023 17:27:47 +0100 Subject: [PATCH 123/225] ShmemServer: Set `disconnect` flag on drop and check it on requests --- libraries/shared-memory-server/src/channel.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libraries/shared-memory-server/src/channel.rs b/libraries/shared-memory-server/src/channel.rs index 10643122..d50ef8ad 100644 --- a/libraries/shared-memory-server/src/channel.rs +++ b/libraries/shared-memory-server/src/channel.rs @@ -110,6 +110,12 @@ impl ShmemChannel { event .set(EventState::Signaled) .map_err(|err| eyre!("failed to send message over ShmemChannel: {err}"))?; + + let disconnected = self.disconnect().load(std::sync::atomic::Ordering::Acquire); + if disconnected { + eyre::bail!("server closed the connection"); + } + Ok(()) } @@ -200,6 +206,9 @@ impl Drop for ShmemChannel { tracing::debug!("closing ShmemServer after client disconnect"); } else { tracing::error!("ShmemServer closed before client disconnect"); + + self.disconnect() + .store(true, 
std::sync::atomic::Ordering::Release); } } else { tracing::debug!("disconnecting client"); From fcea397330b10b5c319e2862decffc5197b7c090 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 2 Feb 2023 16:37:17 +0100 Subject: [PATCH 124/225] Fix spawning of runtime through `python3` --- binaries/daemon/src/spawn.rs | 95 ++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 52 deletions(-) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index ee51a46f..6616b30c 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -21,25 +21,7 @@ pub async fn spawn_node( shmem_handler_tx: flume::Sender, ) -> eyre::Result<()> { let node_id = node.id.clone(); - tracing::trace!("Spawning node `{dataflow_id}/{node_id}`"); - - let source = node_source(&node)?; - - let resolved_path = if source_is_url(&source) { - // try to download the shared library - let target_path = Path::new("build") - .join(node_id.to_string()) - .with_extension(EXE_EXTENSION); - download_file(source, &target_path) - .await - .wrap_err("failed to download custom node")?; - target_path.clone() - } else { - resolve_path(&source, &working_dir) - .wrap_err_with(|| format!("failed to resolve node source `{}`", source))? 
- }; - - tracing::info!("spawning {}", resolved_path.display()); + tracing::debug!("Spawning node `{dataflow_id}/{node_id}`"); let daemon_control_region = ShmemConf::new() .size(4096) @@ -70,12 +52,26 @@ pub async fn spawn_node( }); } - let mut command = tokio::process::Command::new(&resolved_path); - command.current_dir(working_dir); - command.stdin(Stdio::null()); - let mut child = match node.kind { dora_core::descriptor::CoreNodeKind::Custom(n) => { + let resolved_path = if source_is_url(&n.source) { + // try to download the shared library + let target_path = Path::new("build") + .join(node_id.to_string()) + .with_extension(EXE_EXTENSION); + download_file(&n.source, &target_path) + .await + .wrap_err("failed to download custom node")?; + target_path.clone() + } else { + resolve_path(&n.source, &working_dir) + .wrap_err_with(|| format!("failed to resolve node source `{}`", n.source))? + }; + + tracing::info!("spawning {}", resolved_path.display()); + let mut command = tokio::process::Command::new(&resolved_path); + command.current_dir(working_dir); + command.stdin(Stdio::null()); let node_config = NodeConfig { dataflow_id, node_id: node_id.clone(), @@ -106,6 +102,29 @@ pub async fn spawn_node( })? } dora_core::descriptor::CoreNodeKind::Runtime(n) => { + let has_python_operator = n + .operators + .iter() + .any(|x| matches!(x.config.source, OperatorSource::Python { .. })); + + let has_other_operator = n + .operators + .iter() + .any(|x| !matches!(x.config.source, OperatorSource::Python { .. 
})); + + let mut command = if has_python_operator && !has_other_operator { + // Use python to spawn runtime if there is a python operator + let mut command = tokio::process::Command::new("python3"); + command.args(["-c", "import dora; dora.start_runtime()"]); + command + } else if !has_python_operator && has_other_operator { + tokio::process::Command::new("dora-runtime") + } else { + eyre::bail!("Runtime can not mix Python Operator with other type of operator."); + }; + command.current_dir(working_dir); + command.stdin(Stdio::null()); + let runtime_config = RuntimeConfig { node: NodeConfig { dataflow_id, @@ -125,9 +144,7 @@ pub async fn spawn_node( .wrap_err("failed to serialize runtime config")?, ); - command.spawn().wrap_err_with(move || { - format!("failed to run runtime at `{}`", resolved_path.display()) - })? + command.spawn().wrap_err("failed to run runtime")? } }; @@ -153,29 +170,3 @@ pub async fn spawn_node( }); Ok(()) } - -fn node_source(node: &ResolvedNode) -> eyre::Result<&str> { - match &node.kind { - dora_core::descriptor::CoreNodeKind::Runtime(node) => { - let has_python_operator = node - .operators - .iter() - .any(|x| matches!(x.config.source, OperatorSource::Python { .. })); - - let has_other_operator = node - .operators - .iter() - .any(|x| !matches!(x.config.source, OperatorSource::Python { .. 
})); - - if has_python_operator && !has_other_operator { - // Use python to spawn runtime if there is a python operator - Ok("python3") - } else if !has_python_operator && has_other_operator { - Ok("dora-runtime") - } else { - eyre::bail!("Runtime can not mix Python Operator with other type of operator."); - } - } - dora_core::descriptor::CoreNodeKind::Custom(node) => Ok(&node.source), - } -} From 26def8d26aa7485fddeac645c33d9a2c42fbbaee Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 2 Feb 2023 16:59:49 +0100 Subject: [PATCH 125/225] Fix parsing of dora timer inputs --- libraries/core/src/config.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/libraries/core/src/config.rs b/libraries/core/src/config.rs index e1ecadbe..c6170e19 100644 --- a/libraries/core/src/config.rs +++ b/libraries/core/src/config.rs @@ -164,9 +164,9 @@ impl<'de> Deserialize<'de> for InputMapping { .split_once('/') .ok_or_else(|| serde::de::Error::custom("input must start with `/`"))?; - let deserialized = if let Some(dora_output) = source.strip_prefix("dora/") { - match dora_output { - "timer" => { + let deserialized = match source { + "dora" => match output.split_once('/') { + Some(("timer", output)) => { let (unit, value) = output.split_once('/').ok_or_else(|| { serde::de::Error::custom( "timer input must specify unit and value (e.g. 
`secs/5` or `millis/100`)", @@ -197,17 +197,21 @@ impl<'de> Deserialize<'de> for InputMapping { }; Self::Timer { interval } } - other => { + Some((other, _)) => { return Err(serde::de::Error::custom(format!( "unknown dora input `{other}`" ))) } - } - } else { - Self::User(UserInputMapping { + None => { + return Err(serde::de::Error::custom(format!( + "dora input has invalid format" + ))) + } + }, + _ => Self::User(UserInputMapping { source: source.to_owned().into(), output: output.to_owned().into(), - }) + }), }; Ok(deserialized) From 772fc41d936fe7f4c97cf7c25c01de38934054c5 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 2 Feb 2023 17:07:38 +0100 Subject: [PATCH 126/225] Slight improvements to log messages --- binaries/daemon/src/listener.rs | 2 -- binaries/runtime/src/lib.rs | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener.rs index 9ddf605c..9e12a2de 100644 --- a/binaries/daemon/src/listener.rs +++ b/binaries/daemon/src/listener.rs @@ -144,8 +144,6 @@ impl Listener { // remove that event if let Some(event) = self.queue.remove(index) { - tracing::debug!("dropping event {event:?}"); - if let NodeEvent::Input { data: Some(data), .. 
} = event diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 9b61d11c..686c9c02 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -105,6 +105,7 @@ pub fn main() -> eyre::Result<()> { Ok(()) } +#[tracing::instrument(skip(node, events, operator_channels), fields(node.id))] async fn run( mut node: DoraNode, operators: HashMap, From 248afa9bccba4d43d7c9a28678bf11075ce7350f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 3 Feb 2023 10:21:31 +0100 Subject: [PATCH 127/225] Update `check_input` for changes to `UserInputMapping` --- binaries/cli/src/check.rs | 68 ++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/binaries/cli/src/check.rs b/binaries/cli/src/check.rs index 005c517d..51e280d7 100644 --- a/binaries/cli/src/check.rs +++ b/binaries/cli/src/check.rs @@ -1,12 +1,12 @@ use crate::{control_connection, graph::read_descriptor}; use dora_core::{ adjust_shared_library_path, - config::{InputMapping, UserInputMapping}, + config::{DataId, InputMapping, OperatorId, UserInputMapping}, descriptor::{self, source_is_url, CoreNodeKind, OperatorSource}, topics::ControlRequest, }; use eyre::{bail, eyre, Context}; -use std::{env::consts::EXE_EXTENSION, io::Write, path::Path}; +use std::{env::consts::EXE_EXTENSION, io::Write, path::Path, str::FromStr}; use termcolor::{Color, ColorChoice, ColorSpec, WriteColor}; pub fn check_environment() -> eyre::Result<()> { @@ -193,55 +193,41 @@ fn check_input( ) -> Result<(), eyre::ErrReport> { match mapping { InputMapping::Timer { interval: _ } => {} - InputMapping::User(UserInputMapping { - source, - operator, - output, - }) => { + InputMapping::User(UserInputMapping { source, output }) => { let source_node = nodes.iter().find(|n| &n.id == source).ok_or_else(|| { eyre!("source node `{source}` mapped to input `{input_id_str}` does not exist",) })?; - if let Some(operator_id) = operator { - let operator = match &source_node.kind { 
- CoreNodeKind::Runtime(runtime) => { - let operator = runtime.operators.iter().find(|o| &o.id == operator_id); - operator.ok_or_else(|| { + match &source_node.kind { + CoreNodeKind::Custom(custom_node) => { + if !custom_node.run_config.outputs.contains(output) { + bail!( + "output `{source}/{output}` mapped to \ + input `{input_id_str}` does not exist", + ); + } + } + CoreNodeKind::Runtime(runtime) => { + let (operator_id, output) = output.split_once('/').unwrap_or_default(); + let operator_id = OperatorId::from(operator_id.to_owned()); + let output = DataId::from(output.to_owned()); + + let operator = runtime + .operators + .iter() + .find(|o| o.id == operator_id) + .ok_or_else(|| { eyre!( "source operator `{source}/{operator_id}` used \ for input `{input_id_str}` does not exist", ) - })? - } - CoreNodeKind::Custom(_) => { + })?; + + if !operator.config.outputs.contains(&output) { bail!( - "input `{input_id_str}` references operator \ - `{source}/{operator_id}`, but `{source}` is a \ - custom node", + "output `{source}/{operator_id}/{output}` mapped to \ + input `{input_id_str}` does not exist", ); } - }; - - if !operator.config.outputs.contains(output) { - bail!( - "output `{source}/{operator_id}/{output}` mapped to \ - input `{input_id_str}` does not exist", - ); - } - } else { - match &source_node.kind { - CoreNodeKind::Runtime(_) => bail!( - "input `{input_id_str}` references output \ - `{source}/{output}`, but `{source}` is a \ - runtime node", - ), - CoreNodeKind::Custom(custom_node) => { - if !custom_node.run_config.outputs.contains(output) { - bail!( - "output `{source}/{output}` mapped to \ - input `{input_id_str}` does not exist", - ); - } - } } } } From 3b79e4cca0af815fbe3087e52a27bc32abb5168e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 3 Feb 2023 10:45:07 +0100 Subject: [PATCH 128/225] Fix: Don't ignore errors in `run.sh` of Python example --- examples/python-dataflow/run.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/examples/python-dataflow/run.sh b/examples/python-dataflow/run.sh index c4036109..0870e1b8 100644 --- a/examples/python-dataflow/run.sh +++ b/examples/python-dataflow/run.sh @@ -1,3 +1,5 @@ +set -e + python3 -m venv .env . $(pwd)/.env/bin/activate # Dev dependencies From 2e67362c155e3a8375ab2df087864a75524b97d5 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 3 Feb 2023 10:52:35 +0100 Subject: [PATCH 129/225] Fix: Python operator spawning is blocking, so do it in a separate thread --- binaries/runtime/src/lib.rs | 26 +++++++++++++++---------- binaries/runtime/src/operator/mod.rs | 4 ++-- binaries/runtime/src/operator/python.rs | 3 ++- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 686c9c02..dde4837f 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -7,9 +7,9 @@ use dora_core::{ }; use dora_node_api::DoraNode; use eyre::{bail, Context, Result}; -use futures::{Stream, StreamExt}; +use futures::{stream::FuturesUnordered, Stream, StreamExt}; use futures_concurrency::Merge; -use operator::{spawn_operator, OperatorEvent, StopReason}; +use operator::{run_operator, OperatorEvent, StopReason}; use std::{collections::HashMap, mem}; use tokio::{runtime::Builder, sync::mpsc}; @@ -78,19 +78,21 @@ pub fn main() -> eyre::Result<()> { .wrap_err("Could not build a tokio runtime.")?; let mut operator_channels = HashMap::new(); + let operator_threads = FuturesUnordered::new(); for operator_config in &operators { - let events_tx = operator_events_tx.get(&operator_config.id).unwrap(); + let events_tx = operator_events_tx.get(&operator_config.id).unwrap().clone(); let (operator_tx, incoming_events) = mpsc::channel(10); - spawn_operator( - &node_id, - operator_config.clone(), - incoming_events, - events_tx.clone(), - ) - .wrap_err_with(|| format!("failed to init operator {}", operator_config.id))?; + let operator_definition = operator_config.clone(); + let node_id = 
node_id.clone(); + let task = std::thread::spawn(move || { + let operator_id = operator_definition.id.clone(); + run_operator(&node_id, operator_definition, incoming_events, events_tx) + .wrap_err_with(|| format!("failed to init operator {operator_id}")) + }); operator_channels.insert(operator_config.id.clone(), operator_tx); + operator_threads.push(task); } let operator_config = operators.into_iter().map(|c| (c.id, c.config)).collect(); @@ -102,6 +104,10 @@ pub fn main() -> eyre::Result<()> { .join() .map_err(|err| eyre::eyre!("Stop thread failed with err: {err:#?}"))? .wrap_err("Stop loop thread failed unexpectedly.")?; + for thread in operator_threads { + thread.join().unwrap()?; + } + Ok(()) } diff --git a/binaries/runtime/src/operator/mod.rs b/binaries/runtime/src/operator/mod.rs index 0d0a2b4a..e590cea1 100644 --- a/binaries/runtime/src/operator/mod.rs +++ b/binaries/runtime/src/operator/mod.rs @@ -15,7 +15,7 @@ type Tracer = (); mod python; // mod shared_lib; -pub fn spawn_operator( +pub fn run_operator( node_id: &NodeId, operator_definition: OperatorDefinition, incoming_events: Receiver, @@ -49,7 +49,7 @@ pub fn spawn_operator( todo!() } OperatorSource::Python(source) => { - python::spawn( + python::run( node_id, &operator_definition.id, source, diff --git a/binaries/runtime/src/operator/python.rs b/binaries/runtime/src/operator/python.rs index 2a38879b..c7b6f595 100644 --- a/binaries/runtime/src/operator/python.rs +++ b/binaries/runtime/src/operator/python.rs @@ -34,7 +34,8 @@ fn traceback(err: pyo3::PyErr) -> eyre::Report { }) } -pub fn spawn( +#[tracing::instrument(skip(events_tx, incoming_events, tracer))] +pub fn run( node_id: &NodeId, operator_id: &OperatorId, source: &str, From ea136adc2ed5f42fc21a3730f75e40ea92bca1b5 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 3 Feb 2023 11:22:17 +0100 Subject: [PATCH 130/225] Fix: Run Python operator in main thread --- binaries/runtime/src/lib.rs | 82 ++++++++++++++++--------------------- 1 file 
changed, 35 insertions(+), 47 deletions(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index dde4837f..9b4c1f49 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -7,13 +7,13 @@ use dora_core::{ }; use dora_node_api::DoraNode; use eyre::{bail, Context, Result}; -use futures::{stream::FuturesUnordered, Stream, StreamExt}; +use futures::{Stream, StreamExt}; use futures_concurrency::Merge; use operator::{run_operator, OperatorEvent, StopReason}; use std::{collections::HashMap, mem}; use tokio::{runtime::Builder, sync::mpsc}; -use tokio_stream::{wrappers::ReceiverStream, StreamMap}; +use tokio_stream::wrappers::ReceiverStream; mod operator; @@ -32,31 +32,21 @@ pub fn main() -> eyre::Result<()> { let node_id = config.node_id.clone(); let (node, daemon_events) = DoraNode::init(config)?; - let mut operator_events = StreamMap::new(); - // let mut operator_stop_publishers = HashMap::new(); - let mut operator_events_tx = HashMap::new(); - - for operator_config in &operators { - let (events_tx, events) = mpsc::channel(1); - // let stop_publisher = publisher( - // &config.node_id, - // operator_config.id.clone(), - // STOP_TOPIC.to_owned().into(), - // communication.as_mut(), - // ) - // .with_context(|| { - // format!( - // "failed to create stop publisher for operator {}", - // operator_config.id - // ) - // })?; - // operator_stop_publishers.insert(operator_config.id.clone(), stop_publisher); - - operator_events.insert(operator_config.id.clone(), ReceiverStream::new(events)); - operator_events_tx.insert(operator_config.id.clone(), events_tx); - } + let operator_definition = if operators.is_empty() { + bail!("no operators"); + } else if operators.len() > 1 { + bail!("multiple operators are not supported"); + } else { + let mut ops = operators; + ops.remove(0) + }; - let operator_events = operator_events.map(|(id, event)| Event::Operator { id, event }); + let (operator_events_tx, events) = mpsc::channel(1); + let 
operator_id = operator_definition.id.clone(); + let operator_events = ReceiverStream::new(events).map(move |event| Event::Operator { + id: operator_id.clone(), + event, + }); let daemon_events = futures::stream::unfold(daemon_events, |mut stream| async { let event = stream.recv_async().await.map(|event| match event { dora_node_api::daemon::Event::Stop => Event::Stop, @@ -78,35 +68,33 @@ pub fn main() -> eyre::Result<()> { .wrap_err("Could not build a tokio runtime.")?; let mut operator_channels = HashMap::new(); - let operator_threads = FuturesUnordered::new(); - - for operator_config in &operators { - let events_tx = operator_events_tx.get(&operator_config.id).unwrap().clone(); - let (operator_tx, incoming_events) = mpsc::channel(10); - let operator_definition = operator_config.clone(); - let node_id = node_id.clone(); - let task = std::thread::spawn(move || { - let operator_id = operator_definition.id.clone(); - run_operator(&node_id, operator_definition, incoming_events, events_tx) - .wrap_err_with(|| format!("failed to init operator {operator_id}")) - }); - - operator_channels.insert(operator_config.id.clone(), operator_tx); - operator_threads.push(task); - } - - let operator_config = operators.into_iter().map(|c| (c.id, c.config)).collect(); + let (operator_channel, incoming_events) = mpsc::channel(10); + operator_channels.insert(operator_definition.id.clone(), operator_channel); + + tracing::info!("spawning main task"); + let operator_config = [( + operator_definition.id.clone(), + operator_definition.config.clone(), + )] + .into_iter() + .collect(); let main_task = std::thread::spawn(move || -> Result<()> { tokio_runtime.block_on(run(node, operator_config, events, operator_channels)) }); + let operator_id = operator_definition.id.clone(); + run_operator( + &node_id, + operator_definition, + incoming_events, + operator_events_tx, + ) + .wrap_err_with(|| format!("failed to run operator {operator_id}"))?; + main_task .join() .map_err(|err| eyre::eyre!("Stop 
thread failed with err: {err:#?}"))? .wrap_err("Stop loop thread failed unexpectedly.")?; - for thread in operator_threads { - thread.join().unwrap()?; - } Ok(()) } From 0d1bbc8eb527e727f6cb322b9ecfc7004031e1ef Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 8 Feb 2023 12:26:54 +0100 Subject: [PATCH 131/225] Fix: actually send input to runtime instead of dropping future --- binaries/runtime/src/lib.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 9b4c1f49..f1336648 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -199,11 +199,13 @@ async fn run( continue; }; - let _ = operator_channel.send(operator::IncomingEvent::Input { - input_id, - metadata, - data, - }); + operator_channel + .send(operator::IncomingEvent::Input { + input_id, + metadata, + data, + }) + .await?; } Event::InputClosed(_) => {} Event::Error(err) => eyre::bail!("received error event: {err}"), From 77c4b11c8d89ab0187ffe283198bc4c667ef3525 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 8 Feb 2023 12:27:31 +0100 Subject: [PATCH 132/225] Fix: adjust input mappings that refer to single operator nodes --- libraries/core/src/descriptor/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libraries/core/src/descriptor/mod.rs b/libraries/core/src/descriptor/mod.rs index 558c215b..238ea75b 100644 --- a/libraries/core/src/descriptor/mod.rs +++ b/libraries/core/src/descriptor/mod.rs @@ -46,6 +46,14 @@ impl Descriptor { NodeKind::Custom(node) => node.run_config.inputs.values_mut().collect(), NodeKind::Operator(operator) => operator.config.inputs.values_mut().collect(), }; + for mapping in input_mappings.into_iter().filter_map(|m| match m { + InputMapping::Timer { .. 
} => None, + InputMapping::User(m) => Some(m), + }) { + if let Some(op_name) = single_operator_nodes.get(&mapping.source).copied() { + mapping.output = DataId::from(format!("{op_name}/{}", mapping.output)); + } + } // resolve nodes let kind = match node.kind { From 6980636686bb2a26b7731cd766e31b955e88a7ed Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 8 Feb 2023 21:26:29 +0100 Subject: [PATCH 133/225] Update futures-concurrency to v7.1 --- Cargo.lock | 31 +++++-------------------------- binaries/coordinator/Cargo.toml | 2 +- binaries/daemon/Cargo.toml | 2 +- binaries/runtime/Cargo.toml | 2 +- binaries/runtime/src/lib.rs | 6 +++--- 5 files changed, 11 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6f8d0ca3..7184e3fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -962,7 +962,7 @@ dependencies = [ "dora-node-api", "eyre", "futures", - "futures-concurrency 5.0.1", + "futures-concurrency", "rand", "serde", "serde_json", @@ -1005,7 +1005,7 @@ dependencies = [ "eyre", "flume", "futures", - "futures-concurrency 7.0.0", + "futures-concurrency", "serde", "serde_json", "serde_yaml 0.8.23", @@ -1192,7 +1192,7 @@ dependencies = [ "fern", "flume", "futures", - "futures-concurrency 2.0.3", + "futures-concurrency", "libloading", "opentelemetry", "opentelemetry-system-metrics", @@ -1388,30 +1388,9 @@ dependencies = [ [[package]] name = "futures-concurrency" -version = "2.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e98b7b5aedee7c34a5cfb1ee1681af8faf46e2f30c0b8af5ea08eba517d61c" -dependencies = [ - "async-trait", - "futures-core", - "pin-project", -] - -[[package]] -name = "futures-concurrency" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "407ed2aa475d777e35fb167144b63babd0377b2f9a528ae3ec4bec94f1ce1f1a" -dependencies = [ - "futures-core", - "pin-project", -] - -[[package]] -name = "futures-concurrency" -version = "7.0.0" +version = "7.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a740c32e1bde284ce2f51df98abd4fa38e9e539670443c111211777e3ab09927" +checksum = "e06f199437c8a435c12ad153c5a1f4e131871cf6f6025585bb15e8cbb414f0dc" dependencies = [ "bitvec", "futures-core", diff --git a/binaries/coordinator/Cargo.toml b/binaries/coordinator/Cargo.toml index 0bec9dd6..1a75c20f 100644 --- a/binaries/coordinator/Cargo.toml +++ b/binaries/coordinator/Cargo.toml @@ -24,7 +24,7 @@ dora-core = { workspace = true } dora-message = { path = "../../libraries/message" } tracing = "0.1.36" tracing-subscriber = "0.3.15" -futures-concurrency = "5.0.1" +futures-concurrency = "7.1.0" zenoh = { git = "https://github.com/eclipse-zenoh/zenoh.git", rev = "79a136e4fd90b11ff5d775ced981af53c4f1071b" } serde_json = "1.0.86" dora-download = { path = "../../libraries/extensions/download" } diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 6be78b6a..3fcf4f13 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -11,7 +11,7 @@ tokio = { version = "1.20.1", features = ["full"] } tokio-stream = { version = "0.1.11", features = ["net"] } tracing = "0.1.36" tracing-subscriber = "0.3.15" -futures-concurrency = "7.0.0" +futures-concurrency = "7.1.0" serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.86" dora-core = { path = "../../libraries/core" } diff --git a/binaries/runtime/Cargo.toml b/binaries/runtime/Cargo.toml index 033db80c..c94bb672 100644 --- a/binaries/runtime/Cargo.toml +++ b/binaries/runtime/Cargo.toml @@ -21,7 +21,7 @@ opentelemetry = { version = "0.17", features = [ opentelemetry-system-metrics = { version = "0.1.1", optional = true } eyre = "0.6.8" futures = "0.3.21" -futures-concurrency = "2.0.3" +futures-concurrency = "7.1.0" libloading = "0.7.3" serde_yaml = "0.8.23" tokio = { version = "1.17.0", features = ["full"] } diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index f1336648..829b2710 100644 --- 
a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -8,7 +8,7 @@ use dora_core::{ use dora_node_api::DoraNode; use eyre::{bail, Context, Result}; use futures::{Stream, StreamExt}; -use futures_concurrency::Merge; +use futures_concurrency::stream::Merge; use operator::{run_operator, OperatorEvent, StopReason}; use std::{collections::HashMap, mem}; @@ -47,7 +47,7 @@ pub fn main() -> eyre::Result<()> { id: operator_id.clone(), event, }); - let daemon_events = futures::stream::unfold(daemon_events, |mut stream| async { + let daemon_events = Box::pin(futures::stream::unfold(daemon_events, |mut stream| async { let event = stream.recv_async().await.map(|event| match event { dora_node_api::daemon::Event::Stop => Event::Stop, dora_node_api::daemon::Event::Input { id, metadata, data } => Event::Input { @@ -60,7 +60,7 @@ pub fn main() -> eyre::Result<()> { _ => todo!(), }); event.map(|event| (event, stream)) - }); + })); let events = (operator_events, daemon_events).merge(); let tokio_runtime = Builder::new_current_thread() .enable_all() From c27d7d1a3498086ce803d12d4567ac5079290220 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 8 Feb 2023 21:27:27 +0100 Subject: [PATCH 134/225] Propagate panics of runtime main task --- binaries/runtime/src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 829b2710..0b651dbf 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -91,10 +91,10 @@ pub fn main() -> eyre::Result<()> { ) .wrap_err_with(|| format!("failed to run operator {operator_id}"))?; - main_task - .join() - .map_err(|err| eyre::eyre!("Stop thread failed with err: {err:#?}"))? 
- .wrap_err("Stop loop thread failed unexpectedly.")?; + match main_task.join() { + Ok(result) => result.wrap_err("Stop loop thread failed unexpectedly.")?, + Err(panic) => std::panic::resume_unwind(panic), + } Ok(()) } From 18a908c579d3707c42c7265444be263649c3cce1 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 8 Feb 2023 21:37:19 +0100 Subject: [PATCH 135/225] Close operator event channel after last input is closed --- binaries/runtime/src/lib.rs | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 0b651dbf..f390d9ed 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -11,7 +11,11 @@ use futures::{Stream, StreamExt}; use futures_concurrency::stream::Merge; use operator::{run_operator, OperatorEvent, StopReason}; -use std::{collections::HashMap, mem}; +use core::fmt; +use std::{ + collections::{BTreeSet, HashMap}, + mem, +}; use tokio::{runtime::Builder, sync::mpsc}; use tokio_stream::wrappers::ReceiverStream; @@ -118,6 +122,11 @@ async fn run( _started }; + let mut open_operator_inputs: HashMap<_, BTreeSet<_>> = operators + .iter() + .map(|(id, config)| (id, config.inputs.keys().collect())) + .collect(); + // let mut stopped_operators = BTreeSet::new(); while let Some(event) = events.next().await { @@ -207,7 +216,24 @@ async fn run( }) .await?; } - Event::InputClosed(_) => {} + Event::InputClosed(id) => { + let Some((operator_id, input_id)) = id.as_str().split_once('/') else { + tracing::warn!("received InputClosed event for non-operator input {id}"); + continue; + }; + let operator_id = OperatorId::from(operator_id.to_owned()); + let input_id = DataId::from(input_id.to_owned()); + + if let Some(open_inputs) = open_operator_inputs.get_mut(&operator_id) { + open_inputs.remove(&input_id); + if open_inputs.is_empty() { + // all inputs of the node were closed -> close its event channel + tracing::info!("all inputs of 
operator {operator_id} were closed -> closing event channel"); + open_operator_inputs.remove(&operator_id); + operator_channels.remove(&operator_id); + } + } + } Event::Error(err) => eyre::bail!("received error event: {err}"), } } From 70bfa35d0c1f3f410758bce63be3cf093f154a95 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 8 Feb 2023 21:44:10 +0100 Subject: [PATCH 136/225] Use rendezvous channel for forwarding inputs in node API Since inputs are passed through borrowed shared memory, we need to report back when we're done with them. We do this through a drop handler. By using a rendezvous channel for forwarding, we avoid that the drop handler is stuck in the queue if the node finished already. --- apis/rust/node/src/daemon.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index ed137a41..d428aa94 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -195,7 +195,7 @@ impl EventStream { .map_err(|e| eyre!(e)) .wrap_err("failed to create subscription with dora-daemon")?; - let (tx, rx) = flume::bounded(1); + let (tx, rx) = flume::bounded(0); let mut drop_tokens = Vec::new(); let thread = std::thread::spawn(move || loop { let daemon_request = DaemonRequest::NextEvent { From 9e687be7cd9ec511798ca4a0c95645c6b44c7cfb Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 8 Feb 2023 23:14:46 +0100 Subject: [PATCH 137/225] Add a ctrl-c handler for dora-daemon that stops all connected nodes --- Cargo.lock | 128 +++++++++++++++++++++++++++++++++++-- binaries/daemon/Cargo.toml | 1 + binaries/daemon/src/lib.rs | 17 +++++ 3 files changed, 139 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7184e3fc..85e40216 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,6 +786,16 @@ dependencies = [ "syn", ] +[[package]] +name = "ctrlc" +version = "3.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bbcf33c2a618cbe41ee43ae6e9f2e48368cd9f9db2896f10167d8d762679f639" +dependencies = [ + "nix 0.26.2", + "windows-sys 0.45.0", +] + [[package]] name = "cty" version = "0.2.2" @@ -999,6 +1009,7 @@ name = "dora-daemon" version = "0.1.0" dependencies = [ "clap 3.2.20", + "ctrlc", "dora-core", "dora-download", "dora-message", @@ -1180,6 +1191,7 @@ name = "dora-runtime" version = "0.1.2" dependencies = [ "clap 3.2.20", + "ctrlc", "dora-core", "dora-download", "dora-message", @@ -1941,9 +1953,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.121" +version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" [[package]] name = "libloading" @@ -2303,6 +2315,18 @@ dependencies = [ "memoffset", ] +[[package]] +name = "nix" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" +dependencies = [ + "bitflags", + "cfg-if 1.0.0", + "libc", + "static_assertions", +] + [[package]] name = "ntapi" version = "0.3.7" @@ -2312,6 +2336,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num-bigint-dig" version = "0.7.0" @@ -2573,6 +2607,12 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "parking" version = "2.0.0" 
@@ -3318,6 +3358,8 @@ version = "0.1.2" dependencies = [ "dora-node-api", "eyre", + "tracing", + "tracing-subscriber", ] [[package]] @@ -3680,9 +3722,9 @@ checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" [[package]] name = "smallvec" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "socket2" @@ -3718,6 +3760,12 @@ dependencies = [ "der", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "stop-token" version = "0.7.0" @@ -4143,11 +4191,11 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.15" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60db860322da191b40952ad9affe65ea23e7dd6a5c442c2c42865810c6ab8e6b" +checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" dependencies = [ - "ansi_term", + "nu-ansi-term", "sharded-slab", "smallvec", "thread_local", @@ -4626,6 +4674,36 @@ dependencies = [ "windows_x86_64_msvc 0.36.1", ] +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc 0.42.1", + "windows_i686_gnu 0.42.1", + "windows_i686_msvc 0.42.1", + "windows_x86_64_gnu 0.42.1", + 
"windows_x86_64_gnullvm", + "windows_x86_64_msvc 0.42.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" + [[package]] name = "windows_aarch64_msvc" version = "0.32.0" @@ -4638,6 +4716,12 @@ version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" + [[package]] name = "windows_i686_gnu" version = "0.32.0" @@ -4650,6 +4734,12 @@ version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" +[[package]] +name = "windows_i686_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" + [[package]] name = "windows_i686_msvc" version = "0.32.0" @@ -4662,6 +4752,12 @@ version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" +[[package]] +name = "windows_i686_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" + [[package]] name = "windows_x86_64_gnu" version = "0.32.0" @@ -4674,6 +4770,18 @@ version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" + [[package]] name = "windows_x86_64_msvc" version = "0.32.0" @@ -4686,6 +4794,12 @@ version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" + [[package]] name = "winreg" version = "0.10.1" diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 3fcf4f13..50928b6c 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -23,3 +23,4 @@ uuid = { version = "1.1.2", features = ["v4"] } futures = "0.3.25" clap = { version = "3.1.8", features = ["derive"] } shared-memory-server = { path = "../../libraries/shared-memory-server" } +ctrlc = "3.2.5" diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 249748b4..b46b8fe2 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -115,6 +115,15 @@ impl Daemon { exit_when_done: Option>, ) -> eyre::Result<()> { let (dora_events_tx, dora_events_rx) = mpsc::channel(5); + let ctrlc_tx = dora_events_tx.clone(); + ctrlc::set_handler(move || { + tracing::info!("received ctrc signal"); + if ctrlc_tx.blocking_send(Event::CtrlC).is_err() { + tracing::error!("failed to report ctrl-c event to dora-daemon"); + } + }) + .wrap_err("failed to set ctrl-c handler")?; + let (shared_memory_handler, shared_memory_daemon_rx) = flume::unbounded(); let (shared_memory_handler_node, shared_memory_node_rx) = flume::bounded(10); 
let daemon = Self { @@ -198,6 +207,13 @@ impl Daemon { .wrap_err("received unexpected watchdog reply from coordinator")?; } } + Event::CtrlC => { + for dataflow in self.running.values_mut() { + for (_node_id, channel) in dataflow.subscribe_channels.drain() { + let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; + } + } + } } let elapsed = start.elapsed(); @@ -681,6 +697,7 @@ pub enum Event { Dora(DoraEvent), ShmemHandler(ShmemHandlerEvent), WatchdogInterval, + CtrlC, } impl From for Event { From 9e36eeb3e3b9dd937468eaa7d6dc5d599100bbb3 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 10 Feb 2023 13:00:17 +0100 Subject: [PATCH 138/225] Rename file --- binaries/daemon/src/{listener.rs => listener/mod.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename binaries/daemon/src/{listener.rs => listener/mod.rs} (100%) diff --git a/binaries/daemon/src/listener.rs b/binaries/daemon/src/listener/mod.rs similarity index 100% rename from binaries/daemon/src/listener.rs rename to binaries/daemon/src/listener/mod.rs From ff836d200c429aad44d295188be3aa6e229fee4e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 10 Feb 2023 14:20:10 +0100 Subject: [PATCH 139/225] Start adding back support for daemon communication over TCP --- Cargo.lock | 1 + apis/rust/node/src/daemon.rs | 38 +-- apis/rust/node/src/lib.rs | 12 +- binaries/cli/src/check.rs | 2 +- binaries/coordinator/src/run/mod.rs | 1 + binaries/daemon/Cargo.toml | 1 + binaries/daemon/src/lib.rs | 15 +- binaries/daemon/src/listener/mod.rs | 300 +-------------------- binaries/daemon/src/listener/shmem.rs | 298 +++++++++++++++++++++ binaries/daemon/src/listener/tcp.rs | 368 ++++++++++++++++++++++++++ binaries/daemon/src/spawn.rs | 124 ++++++--- libraries/core/src/daemon_messages.rs | 31 ++- libraries/core/src/descriptor/mod.rs | 7 +- 13 files changed, 827 insertions(+), 371 deletions(-) create mode 100644 binaries/daemon/src/listener/shmem.rs create mode 100644 
binaries/daemon/src/listener/tcp.rs diff --git a/Cargo.lock b/Cargo.lock index 85e40216..58af7d2b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1008,6 +1008,7 @@ dependencies = [ name = "dora-daemon" version = "0.1.0" dependencies = [ + "bincode", "clap 3.2.20", "ctrlc", "dora-core", diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon.rs index d428aa94..2feae668 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon.rs @@ -1,6 +1,6 @@ use dora_core::{ config::{DataId, NodeId}, - daemon_messages::{DaemonReply, DaemonRequest, DataflowId, NodeEvent}, + daemon_messages::{DaemonCommunication, DaemonReply, DaemonRequest, DataflowId, NodeEvent}, }; use dora_message::Metadata; use eyre::{bail, eyre, Context}; @@ -17,21 +17,29 @@ impl DaemonConnection { pub(crate) fn init( dataflow_id: DataflowId, node_id: &NodeId, - daemon_control_region_id: &str, - daemon_events_region_id: &str, + daemon_communication: &DaemonCommunication, ) -> eyre::Result { - let control_channel = ControlChannel::init(dataflow_id, node_id, daemon_control_region_id) - .wrap_err("failed to init control stream")?; - - let (event_stream, event_stream_thread) = - EventStream::init(dataflow_id, node_id, daemon_events_region_id) - .wrap_err("failed to init event stream")?; - - Ok(Self { - control_channel, - event_stream, - event_stream_thread, - }) + match daemon_communication { + DaemonCommunication::Shmem { + daemon_control_region_id, + daemon_events_region_id, + } => { + let control_channel = + ControlChannel::init(dataflow_id, node_id, daemon_control_region_id) + .wrap_err("failed to init control stream")?; + + let (event_stream, event_stream_thread) = + EventStream::init(dataflow_id, node_id, daemon_events_region_id) + .wrap_err("failed to init event stream")?; + + Ok(Self { + control_channel, + event_stream, + event_stream_thread, + }) + } + DaemonCommunication::Tcp { socket_addr } => todo!(), + } } } diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs 
index d21ed053..8bb22514 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -39,21 +39,15 @@ impl DoraNode { dataflow_id, node_id, run_config, - daemon_control_region_id, - daemon_events_region_id, + daemon_communication, } = node_config; let DaemonConnection { control_channel, event_stream, event_stream_thread, - } = DaemonConnection::init( - dataflow_id, - &node_id, - &daemon_control_region_id, - &daemon_events_region_id, - ) - .wrap_err("failed to connect to dora-daemon")?; + } = DaemonConnection::init(dataflow_id, &node_id, &daemon_communication) + .wrap_err("failed to connect to dora-daemon")?; let node = Self { id: node_id, diff --git a/binaries/cli/src/check.rs b/binaries/cli/src/check.rs index 51e280d7..6d164ea8 100644 --- a/binaries/cli/src/check.rs +++ b/binaries/cli/src/check.rs @@ -6,7 +6,7 @@ use dora_core::{ topics::ControlRequest, }; use eyre::{bail, eyre, Context}; -use std::{env::consts::EXE_EXTENSION, io::Write, path::Path, str::FromStr}; +use std::{env::consts::EXE_EXTENSION, io::Write, path::Path}; use termcolor::{Color, ColorChoice, ColorSpec, WriteColor}; pub fn check_environment() -> eyre::Result<()> { diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index f3da0ecf..b0e331dc 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -65,6 +65,7 @@ pub async fn spawn_dataflow( dataflow_id: uuid, working_dir, nodes, + daemon_communication: descriptor.daemon_config, }; let message = serde_json::to_vec(&DaemonCoordinatorEvent::Spawn(spawn_command))?; diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 50928b6c..67470957 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -24,3 +24,4 @@ futures = "0.3.25" clap = { version = "3.1.8", features = ["derive"] } shared-memory-server = { path = "../../libraries/shared-memory-server" } ctrlc = "3.2.5" +bincode = "1.3.3" diff --git a/binaries/daemon/src/lib.rs 
b/binaries/daemon/src/lib.rs index b46b8fe2..c2824980 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -3,8 +3,8 @@ use dora_core::{ config::{DataId, InputMapping, NodeId}, coordinator_messages::DaemonEvent, daemon_messages::{ - self, DaemonCoordinatorEvent, DaemonCoordinatorReply, DaemonReply, DataflowId, DropToken, - SpawnDataflowNodes, + self, DaemonCommunicationConfig, DaemonCoordinatorEvent, DaemonCoordinatorReply, + DaemonReply, DataflowId, DropToken, SpawnDataflowNodes, }, descriptor::{CoreNodeKind, Descriptor, ResolvedNode}, }; @@ -68,12 +68,14 @@ impl Daemon { .ok_or_else(|| eyre::eyre!("canonicalized dataflow path has no parent"))? .to_owned(); - let nodes = read_descriptor(dataflow_path).await?.resolve_aliases(); + let descriptor = read_descriptor(dataflow_path).await?; + let nodes = descriptor.resolve_aliases(); let spawn_command = SpawnDataflowNodes { dataflow_id: Uuid::new_v4(), working_dir, nodes, + daemon_communication: descriptor.daemon_config, }; let exit_when_done = spawn_command @@ -234,8 +236,11 @@ impl Daemon { dataflow_id, working_dir, nodes, + daemon_communication, }) => { - let result = self.spawn_dataflow(dataflow_id, working_dir, nodes).await; + let result = self + .spawn_dataflow(dataflow_id, working_dir, nodes, daemon_communication) + .await; if let Err(err) = &result { tracing::error!("{err:?}"); } @@ -276,6 +281,7 @@ impl Daemon { dataflow_id: uuid::Uuid, working_dir: PathBuf, nodes: Vec, + daemon_communication_config: DaemonCommunicationConfig, ) -> eyre::Result<()> { let dataflow = match self.running.entry(dataflow_id) { std::collections::hash_map::Entry::Vacant(entry) => entry.insert(Default::default()), @@ -318,6 +324,7 @@ impl Daemon { node, self.events_tx.clone(), self.shared_memory_handler_node.clone(), + daemon_communication_config, ) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; diff --git a/binaries/daemon/src/listener/mod.rs b/binaries/daemon/src/listener/mod.rs index 
9e12a2de..70fbb7d5 100644 --- a/binaries/daemon/src/listener/mod.rs +++ b/binaries/daemon/src/listener/mod.rs @@ -1,298 +1,2 @@ -use std::collections::VecDeque; - -use crate::{shared_mem_handler, DaemonNodeEvent, Event}; -use dora_core::{ - config::NodeId, - daemon_messages::{DaemonReply, DaemonRequest, DataflowId, DropEvent, NodeEvent}, -}; -use eyre::{eyre, Context}; -use shared_memory_server::ShmemServer; -use tokio::sync::{mpsc, oneshot}; - -#[tracing::instrument(skip(server, daemon_tx, shmem_handler_tx))] -pub fn listener_loop( - mut server: ShmemServer, - daemon_tx: mpsc::Sender, - shmem_handler_tx: flume::Sender, -) { - // receive the first message - let message = match server - .listen() - .wrap_err("failed to receive register message") - { - Ok(Some(m)) => m, - Ok(None) => { - tracing::info!("channel disconnected before register message"); - return; - } // disconnected - Err(err) => { - tracing::info!("{err:?}"); - return; - } - }; - - match message { - DaemonRequest::Register { - dataflow_id, - node_id, - } => { - let reply = DaemonReply::Result(Ok(())); - match server - .send_reply(&reply) - .wrap_err("failed to send register reply") - { - Ok(()) => { - let mut listener = Listener { - dataflow_id, - node_id, - server, - daemon_tx, - shmem_handler_tx, - subscribed_events: None, - max_queue_len: 10, // TODO: make this configurable - queue: VecDeque::new(), - }; - match listener.run().wrap_err("listener failed") { - Ok(()) => {} - Err(err) => tracing::error!("{err:?}"), - } - } - Err(err) => { - tracing::warn!("{err:?}"); - } - } - } - _ => { - let reply = DaemonReply::Result(Err("must send register message first".into())); - if let Err(err) = server.send_reply(&reply).wrap_err("failed to send reply") { - tracing::warn!("{err:?}"); - } - } - } -} - -struct Listener { - dataflow_id: DataflowId, - node_id: NodeId, - server: ShmemServer, - daemon_tx: mpsc::Sender, - shmem_handler_tx: flume::Sender, - subscribed_events: Option>, - max_queue_len: usize, - queue: 
VecDeque, -} - -impl Listener { - fn run(&mut self) -> eyre::Result<()> { - loop { - // receive the next node message - let message = match self - .server - .listen() - .wrap_err("failed to receive DaemonRequest") - { - Ok(Some(m)) => m, - Ok(None) => { - tracing::info!( - "channel disconnected: {}/{}", - self.dataflow_id, - self.node_id - ); - break; - } // disconnected - Err(err) => { - tracing::warn!("{err:?}"); - continue; - } - }; - - // handle incoming events - self.handle_events()?; - - self.handle_message(message)?; - } - Ok(()) - } - - fn handle_events(&mut self) -> eyre::Result<()> { - if let Some(events) = &mut self.subscribed_events { - while let Ok(event) = events.try_recv() { - self.queue.push_back(event); - } - - // drop oldest input events to maintain max queue length queue - let input_event_count = self - .queue - .iter() - .filter(|e| matches!(e, NodeEvent::Input { .. })) - .count(); - let drop_n = input_event_count.saturating_sub(self.max_queue_len); - self.drop_oldest_inputs(drop_n)?; - } - Ok(()) - } - - fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { - let mut drop_tokens = Vec::new(); - for i in 0..number { - // find index of oldest input event - let index = self - .queue - .iter() - .position(|e| matches!(e, NodeEvent::Input { .. })) - .expect(&format!("no input event found in drop iteration {i}")); - - // remove that event - if let Some(event) = self.queue.remove(index) { - if let NodeEvent::Input { - data: Some(data), .. - } = event - { - drop_tokens.push(data.drop_token); - } - } - } - self.report_drop_tokens(drop_tokens)?; - Ok(()) - } - - fn handle_message(&mut self, message: DaemonRequest) -> eyre::Result<()> { - match message { - DaemonRequest::Register { .. 
} => { - let reply = DaemonReply::Result(Err("unexpected register message".into())); - self.send_reply(&reply)?; - } - DaemonRequest::Stopped => self.process_daemon_event(DaemonNodeEvent::Stopped)?, - DaemonRequest::CloseOutputs(outputs) => { - self.process_daemon_event(DaemonNodeEvent::CloseOutputs(outputs))? - } - DaemonRequest::PrepareOutputMessage { - output_id, - metadata, - data_len, - } => { - let (reply_sender, reply) = oneshot::channel(); - let event = shared_mem_handler::NodeEvent::PrepareOutputMessage { - dataflow_id: self.dataflow_id, - node_id: self.node_id.clone(), - output_id, - metadata, - data_len, - reply_sender, - }; - self.send_shared_memory_event(event)?; - let reply = reply - .blocking_recv() - .wrap_err("failed to receive prepare output reply")?; - // tracing::debug!("prepare latency: {:?}", start.elapsed()?); - self.send_reply(&reply)?; - } - DaemonRequest::SendPreparedMessage { id } => { - let (reply_sender, reply) = oneshot::channel(); - let event = shared_mem_handler::NodeEvent::SendPreparedMessage { id, reply_sender }; - self.send_shared_memory_event(event)?; - self.send_reply( - &reply - .blocking_recv() - .wrap_err("failed to receive send output reply")?, - )?; - } - DaemonRequest::SendEmptyMessage { - output_id, - metadata, - } => { - // let elapsed = metadata.timestamp().get_time().to_system_time().elapsed()?; - // tracing::debug!("listener SendEmptyMessage: {elapsed:?}"); - let event = crate::Event::ShmemHandler(crate::ShmemHandlerEvent::SendOut { - dataflow_id: self.dataflow_id, - node_id: self.node_id.clone(), - output_id, - metadata, - data: None, - }); - let result = self - .send_daemon_event(event) - .map_err(|_| "failed to receive send_empty_message reply".to_owned()); - self.send_reply(&DaemonReply::Result(result))?; - } - DaemonRequest::Subscribe => { - let (tx, rx) = flume::bounded(100); - self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx })?; - self.subscribed_events = Some(rx); - } - 
DaemonRequest::NextEvent { drop_tokens } => { - self.report_drop_tokens(drop_tokens)?; - - // try to take the latest queued event first - let queued_event = self.queue.pop_front().map(DaemonReply::NodeEvent); - let reply = queued_event.unwrap_or_else(|| { - match self.subscribed_events.as_mut() { - // wait for next event - Some(events) => match events.recv() { - Ok(event) => DaemonReply::NodeEvent(event), - Err(flume::RecvError::Disconnected) => DaemonReply::Closed, - }, - None => { - DaemonReply::Result(Err("Ignoring event request because no subscribe \ - message was sent yet" - .into())) - } - } - }); - - self.send_reply(&reply)?; - } - } - Ok(()) - } - - fn report_drop_tokens( - &mut self, - drop_tokens: Vec, - ) -> eyre::Result<()> { - if !drop_tokens.is_empty() { - let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { - tokens: drop_tokens, - }); - self.send_shared_memory_event(drop_event)?; - } - Ok(()) - } - - fn process_daemon_event(&mut self, event: DaemonNodeEvent) -> eyre::Result<()> { - // send NodeEvent to daemon main loop - let (reply_tx, reply) = oneshot::channel(); - let event = Event::Node { - dataflow_id: self.dataflow_id.clone(), - node_id: self.node_id.clone(), - event, - reply_sender: reply_tx, - }; - self.daemon_tx - .blocking_send(event) - .map_err(|_| eyre!("failed to send event to daemon"))?; - let reply = reply - .blocking_recv() - .map_err(|_| eyre!("failed to receive reply from daemon"))?; - self.send_reply(&reply)?; - Ok(()) - } - - fn send_reply(&mut self, reply: &DaemonReply) -> eyre::Result<()> { - self.server - .send_reply(&reply) - .wrap_err("failed to send reply to node") - } - - fn send_shared_memory_event(&self, event: shared_mem_handler::NodeEvent) -> eyre::Result<()> { - self.shmem_handler_tx - .send(event) - .map_err(|_| eyre!("failed to send event to shared_mem_handler")) - } - - fn send_daemon_event(&self, event: crate::Event) -> eyre::Result<()> { - self.daemon_tx - .blocking_send(event) - .map_err(|_| 
eyre!("failed to send event to daemon")) - } -} +pub mod shmem; +pub mod tcp; diff --git a/binaries/daemon/src/listener/shmem.rs b/binaries/daemon/src/listener/shmem.rs new file mode 100644 index 00000000..9e12a2de --- /dev/null +++ b/binaries/daemon/src/listener/shmem.rs @@ -0,0 +1,298 @@ +use std::collections::VecDeque; + +use crate::{shared_mem_handler, DaemonNodeEvent, Event}; +use dora_core::{ + config::NodeId, + daemon_messages::{DaemonReply, DaemonRequest, DataflowId, DropEvent, NodeEvent}, +}; +use eyre::{eyre, Context}; +use shared_memory_server::ShmemServer; +use tokio::sync::{mpsc, oneshot}; + +#[tracing::instrument(skip(server, daemon_tx, shmem_handler_tx))] +pub fn listener_loop( + mut server: ShmemServer, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, +) { + // receive the first message + let message = match server + .listen() + .wrap_err("failed to receive register message") + { + Ok(Some(m)) => m, + Ok(None) => { + tracing::info!("channel disconnected before register message"); + return; + } // disconnected + Err(err) => { + tracing::info!("{err:?}"); + return; + } + }; + + match message { + DaemonRequest::Register { + dataflow_id, + node_id, + } => { + let reply = DaemonReply::Result(Ok(())); + match server + .send_reply(&reply) + .wrap_err("failed to send register reply") + { + Ok(()) => { + let mut listener = Listener { + dataflow_id, + node_id, + server, + daemon_tx, + shmem_handler_tx, + subscribed_events: None, + max_queue_len: 10, // TODO: make this configurable + queue: VecDeque::new(), + }; + match listener.run().wrap_err("listener failed") { + Ok(()) => {} + Err(err) => tracing::error!("{err:?}"), + } + } + Err(err) => { + tracing::warn!("{err:?}"); + } + } + } + _ => { + let reply = DaemonReply::Result(Err("must send register message first".into())); + if let Err(err) = server.send_reply(&reply).wrap_err("failed to send reply") { + tracing::warn!("{err:?}"); + } + } + } +} + +struct Listener { + dataflow_id: DataflowId, + 
node_id: NodeId, + server: ShmemServer, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, + subscribed_events: Option>, + max_queue_len: usize, + queue: VecDeque, +} + +impl Listener { + fn run(&mut self) -> eyre::Result<()> { + loop { + // receive the next node message + let message = match self + .server + .listen() + .wrap_err("failed to receive DaemonRequest") + { + Ok(Some(m)) => m, + Ok(None) => { + tracing::info!( + "channel disconnected: {}/{}", + self.dataflow_id, + self.node_id + ); + break; + } // disconnected + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + }; + + // handle incoming events + self.handle_events()?; + + self.handle_message(message)?; + } + Ok(()) + } + + fn handle_events(&mut self) -> eyre::Result<()> { + if let Some(events) = &mut self.subscribed_events { + while let Ok(event) = events.try_recv() { + self.queue.push_back(event); + } + + // drop oldest input events to maintain max queue length queue + let input_event_count = self + .queue + .iter() + .filter(|e| matches!(e, NodeEvent::Input { .. })) + .count(); + let drop_n = input_event_count.saturating_sub(self.max_queue_len); + self.drop_oldest_inputs(drop_n)?; + } + Ok(()) + } + + fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { + let mut drop_tokens = Vec::new(); + for i in 0..number { + // find index of oldest input event + let index = self + .queue + .iter() + .position(|e| matches!(e, NodeEvent::Input { .. })) + .expect(&format!("no input event found in drop iteration {i}")); + + // remove that event + if let Some(event) = self.queue.remove(index) { + if let NodeEvent::Input { + data: Some(data), .. + } = event + { + drop_tokens.push(data.drop_token); + } + } + } + self.report_drop_tokens(drop_tokens)?; + Ok(()) + } + + fn handle_message(&mut self, message: DaemonRequest) -> eyre::Result<()> { + match message { + DaemonRequest::Register { .. 
} => { + let reply = DaemonReply::Result(Err("unexpected register message".into())); + self.send_reply(&reply)?; + } + DaemonRequest::Stopped => self.process_daemon_event(DaemonNodeEvent::Stopped)?, + DaemonRequest::CloseOutputs(outputs) => { + self.process_daemon_event(DaemonNodeEvent::CloseOutputs(outputs))? + } + DaemonRequest::PrepareOutputMessage { + output_id, + metadata, + data_len, + } => { + let (reply_sender, reply) = oneshot::channel(); + let event = shared_mem_handler::NodeEvent::PrepareOutputMessage { + dataflow_id: self.dataflow_id, + node_id: self.node_id.clone(), + output_id, + metadata, + data_len, + reply_sender, + }; + self.send_shared_memory_event(event)?; + let reply = reply + .blocking_recv() + .wrap_err("failed to receive prepare output reply")?; + // tracing::debug!("prepare latency: {:?}", start.elapsed()?); + self.send_reply(&reply)?; + } + DaemonRequest::SendPreparedMessage { id } => { + let (reply_sender, reply) = oneshot::channel(); + let event = shared_mem_handler::NodeEvent::SendPreparedMessage { id, reply_sender }; + self.send_shared_memory_event(event)?; + self.send_reply( + &reply + .blocking_recv() + .wrap_err("failed to receive send output reply")?, + )?; + } + DaemonRequest::SendEmptyMessage { + output_id, + metadata, + } => { + // let elapsed = metadata.timestamp().get_time().to_system_time().elapsed()?; + // tracing::debug!("listener SendEmptyMessage: {elapsed:?}"); + let event = crate::Event::ShmemHandler(crate::ShmemHandlerEvent::SendOut { + dataflow_id: self.dataflow_id, + node_id: self.node_id.clone(), + output_id, + metadata, + data: None, + }); + let result = self + .send_daemon_event(event) + .map_err(|_| "failed to receive send_empty_message reply".to_owned()); + self.send_reply(&DaemonReply::Result(result))?; + } + DaemonRequest::Subscribe => { + let (tx, rx) = flume::bounded(100); + self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx })?; + self.subscribed_events = Some(rx); + } + 
DaemonRequest::NextEvent { drop_tokens } => { + self.report_drop_tokens(drop_tokens)?; + + // try to take the latest queued event first + let queued_event = self.queue.pop_front().map(DaemonReply::NodeEvent); + let reply = queued_event.unwrap_or_else(|| { + match self.subscribed_events.as_mut() { + // wait for next event + Some(events) => match events.recv() { + Ok(event) => DaemonReply::NodeEvent(event), + Err(flume::RecvError::Disconnected) => DaemonReply::Closed, + }, + None => { + DaemonReply::Result(Err("Ignoring event request because no subscribe \ + message was sent yet" + .into())) + } + } + }); + + self.send_reply(&reply)?; + } + } + Ok(()) + } + + fn report_drop_tokens( + &mut self, + drop_tokens: Vec, + ) -> eyre::Result<()> { + if !drop_tokens.is_empty() { + let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { + tokens: drop_tokens, + }); + self.send_shared_memory_event(drop_event)?; + } + Ok(()) + } + + fn process_daemon_event(&mut self, event: DaemonNodeEvent) -> eyre::Result<()> { + // send NodeEvent to daemon main loop + let (reply_tx, reply) = oneshot::channel(); + let event = Event::Node { + dataflow_id: self.dataflow_id.clone(), + node_id: self.node_id.clone(), + event, + reply_sender: reply_tx, + }; + self.daemon_tx + .blocking_send(event) + .map_err(|_| eyre!("failed to send event to daemon"))?; + let reply = reply + .blocking_recv() + .map_err(|_| eyre!("failed to receive reply from daemon"))?; + self.send_reply(&reply)?; + Ok(()) + } + + fn send_reply(&mut self, reply: &DaemonReply) -> eyre::Result<()> { + self.server + .send_reply(&reply) + .wrap_err("failed to send reply to node") + } + + fn send_shared_memory_event(&self, event: shared_mem_handler::NodeEvent) -> eyre::Result<()> { + self.shmem_handler_tx + .send(event) + .map_err(|_| eyre!("failed to send event to shared_mem_handler")) + } + + fn send_daemon_event(&self, event: crate::Event) -> eyre::Result<()> { + self.daemon_tx + .blocking_send(event) + .map_err(|_| 
eyre!("failed to send event to daemon")) + } +} diff --git a/binaries/daemon/src/listener/tcp.rs b/binaries/daemon/src/listener/tcp.rs new file mode 100644 index 00000000..c34b3aa9 --- /dev/null +++ b/binaries/daemon/src/listener/tcp.rs @@ -0,0 +1,368 @@ +use std::collections::VecDeque; + +use crate::{ + shared_mem_handler, + tcp_utils::{tcp_receive, tcp_send}, + DaemonNodeEvent, Event, +}; +use dora_core::{ + config::NodeId, + daemon_messages::{DaemonReply, DaemonRequest, DataflowId, DropEvent, NodeEvent}, +}; +use eyre::{eyre, Context}; +use tokio::{ + net::{TcpListener, TcpStream}, + sync::{mpsc, oneshot}, +}; + +#[tracing::instrument(skip(listener, daemon_tx, shmem_handler_tx))] +pub async fn listener_loop( + listener: TcpListener, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, +) { + loop { + match listener + .accept() + .await + .wrap_err("failed to accept new connection") + { + Err(err) => { + tracing::info!("{err}"); + } + Ok((connection, _)) => { + tokio::spawn(handle_connection_loop( + connection, + daemon_tx.clone(), + shmem_handler_tx.clone(), + )); + } + } + } +} + +#[tracing::instrument(skip(connection, daemon_tx, shmem_handler_tx))] +pub async fn handle_connection_loop( + mut connection: TcpStream, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, +) { + if let Err(err) = connection.set_nodelay(true) { + tracing::warn!("failed to set nodelay for connection: {err}"); + } + + // receive the first message + let message = match receive_message(&mut connection) + .await + .wrap_err("failed to receive register message") + { + Ok(Some(m)) => m, + Ok(None) => { + tracing::info!("channel disconnected before register message"); + return; + } // disconnected + Err(err) => { + tracing::info!("{err:?}"); + return; + } + }; + + match message { + DaemonRequest::Register { + dataflow_id, + node_id, + } => { + let reply = DaemonReply::Result(Ok(())); + match send_reply(&mut connection, &reply) + .await + .wrap_err("failed to send register 
reply") + { + Ok(()) => { + let mut listener = Listener { + dataflow_id, + node_id, + connection, + daemon_tx, + shmem_handler_tx, + subscribed_events: None, + max_queue_len: 10, // TODO: make this configurable + queue: VecDeque::new(), + }; + match listener.run().await.wrap_err("listener failed") { + Ok(()) => {} + Err(err) => tracing::error!("{err:?}"), + } + } + Err(err) => { + tracing::warn!("{err:?}"); + } + } + } + _ => { + let reply = DaemonReply::Result(Err("must send register message first".into())); + if let Err(err) = send_reply(&mut connection, &reply) + .await + .wrap_err("failed to send reply") + { + tracing::warn!("{err:?}"); + } + } + } +} + +async fn receive_message(connection: &mut TcpStream) -> eyre::Result> { + let raw = match tcp_receive(connection).await { + Ok(raw) => raw, + Err(err) => match err.kind() { + std::io::ErrorKind::UnexpectedEof | std::io::ErrorKind::ConnectionAborted => { + return Ok(None) + } + other => { + return Err(err) + .context("unexpected I/O error while trying to receive DaemonRequest") + } + }, + }; + bincode::deserialize(&raw) + .wrap_err("failed to deserialize DaemonRequest") + .map(Some) +} + +async fn send_reply(connection: &mut TcpStream, message: &DaemonReply) -> eyre::Result<()> { + let serialized = bincode::serialize(&message).wrap_err("failed to serialize DaemonReply")?; + tcp_send(connection, &serialized) + .await + .wrap_err("failed to send DaemonReply")?; + Ok(()) +} + +struct Listener { + dataflow_id: DataflowId, + node_id: NodeId, + connection: TcpStream, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, + subscribed_events: Option>, + max_queue_len: usize, + queue: VecDeque, +} + +impl Listener { + async fn run(&mut self) -> eyre::Result<()> { + loop { + // receive the next node message + let message = match receive_message(&mut self.connection) + .await + .wrap_err("failed to receive DaemonRequest") + { + Ok(Some(m)) => m, + Ok(None) => { + tracing::info!( + "channel disconnected: {}/{}", + 
self.dataflow_id, + self.node_id + ); + break; + } // disconnected + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + }; + + // handle incoming events + self.handle_events()?; + + self.handle_message(message).await?; + } + Ok(()) + } + + fn handle_events(&mut self) -> eyre::Result<()> { + if let Some(events) = &mut self.subscribed_events { + while let Ok(event) = events.try_recv() { + self.queue.push_back(event); + } + + // drop oldest input events to maintain max queue length queue + let input_event_count = self + .queue + .iter() + .filter(|e| matches!(e, NodeEvent::Input { .. })) + .count(); + let drop_n = input_event_count.saturating_sub(self.max_queue_len); + self.drop_oldest_inputs(drop_n)?; + } + Ok(()) + } + + fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { + let mut drop_tokens = Vec::new(); + for i in 0..number { + // find index of oldest input event + let index = self + .queue + .iter() + .position(|e| matches!(e, NodeEvent::Input { .. })) + .expect(&format!("no input event found in drop iteration {i}")); + + // remove that event + if let Some(event) = self.queue.remove(index) { + if let NodeEvent::Input { + data: Some(data), .. + } = event + { + drop_tokens.push(data.drop_token); + } + } + } + self.report_drop_tokens(drop_tokens)?; + Ok(()) + } + + async fn handle_message(&mut self, message: DaemonRequest) -> eyre::Result<()> { + match message { + DaemonRequest::Register { .. } => { + let reply = DaemonReply::Result(Err("unexpected register message".into())); + self.send_reply(&reply).await?; + } + DaemonRequest::Stopped => self.process_daemon_event(DaemonNodeEvent::Stopped).await?, + DaemonRequest::CloseOutputs(outputs) => { + self.process_daemon_event(DaemonNodeEvent::CloseOutputs(outputs)) + .await? 
+ } + DaemonRequest::PrepareOutputMessage { + output_id, + metadata, + data_len, + } => { + let (reply_sender, reply) = oneshot::channel(); + let event = shared_mem_handler::NodeEvent::PrepareOutputMessage { + dataflow_id: self.dataflow_id, + node_id: self.node_id.clone(), + output_id, + metadata, + data_len, + reply_sender, + }; + self.send_shared_memory_event(event)?; + let reply = reply + .await + .wrap_err("failed to receive prepare output reply")?; + // tracing::debug!("prepare latency: {:?}", start.elapsed()?); + self.send_reply(&reply).await?; + } + DaemonRequest::SendPreparedMessage { id } => { + let (reply_sender, reply) = oneshot::channel(); + let event = shared_mem_handler::NodeEvent::SendPreparedMessage { id, reply_sender }; + self.send_shared_memory_event(event)?; + self.send_reply( + &reply + .await + .wrap_err("failed to receive send output reply")?, + ) + .await?; + } + DaemonRequest::SendEmptyMessage { + output_id, + metadata, + } => { + // let elapsed = metadata.timestamp().get_time().to_system_time().elapsed()?; + // tracing::debug!("listener SendEmptyMessage: {elapsed:?}"); + let event = crate::Event::ShmemHandler(crate::ShmemHandlerEvent::SendOut { + dataflow_id: self.dataflow_id, + node_id: self.node_id.clone(), + output_id, + metadata, + data: None, + }); + let result = self + .send_daemon_event(event) + .await + .map_err(|_| "failed to receive send_empty_message reply".to_owned()); + self.send_reply(&DaemonReply::Result(result)).await?; + } + DaemonRequest::Subscribe => { + let (tx, rx) = flume::bounded(100); + self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx }) + .await?; + self.subscribed_events = Some(rx); + } + DaemonRequest::NextEvent { drop_tokens } => { + self.report_drop_tokens(drop_tokens)?; + + // try to take the latest queued event first + let queued_event = self.queue.pop_front().map(DaemonReply::NodeEvent); + let reply = queued_event.unwrap_or_else(|| { + match self.subscribed_events.as_mut() { + // wait 
for next event + Some(events) => match events.recv() { + Ok(event) => DaemonReply::NodeEvent(event), + Err(flume::RecvError::Disconnected) => DaemonReply::Closed, + }, + None => { + DaemonReply::Result(Err("Ignoring event request because no subscribe \ + message was sent yet" + .into())) + } + } + }); + + self.send_reply(&reply).await?; + } + } + Ok(()) + } + + fn report_drop_tokens( + &mut self, + drop_tokens: Vec, + ) -> eyre::Result<()> { + if !drop_tokens.is_empty() { + let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { + tokens: drop_tokens, + }); + self.send_shared_memory_event(drop_event)?; + } + Ok(()) + } + + async fn process_daemon_event(&mut self, event: DaemonNodeEvent) -> eyre::Result<()> { + // send NodeEvent to daemon main loop + let (reply_tx, reply) = oneshot::channel(); + let event = Event::Node { + dataflow_id: self.dataflow_id.clone(), + node_id: self.node_id.clone(), + event, + reply_sender: reply_tx, + }; + self.daemon_tx + .send(event) + .await + .map_err(|_| eyre!("failed to send event to daemon"))?; + let reply = reply + .await + .map_err(|_| eyre!("failed to receive reply from daemon"))?; + self.send_reply(&reply).await?; + Ok(()) + } + + async fn send_reply(&mut self, reply: &DaemonReply) -> eyre::Result<()> { + send_reply(&mut self.connection, reply) + .await + .wrap_err("failed to send reply to node") + } + + fn send_shared_memory_event(&self, event: shared_mem_handler::NodeEvent) -> eyre::Result<()> { + self.shmem_handler_tx + .send(event) + .map_err(|_| eyre!("failed to send event to shared_mem_handler")) + } + + async fn send_daemon_event(&self, event: crate::Event) -> eyre::Result<()> { + self.daemon_tx + .send(event) + .await + .map_err(|_| eyre!("failed to send event to daemon")) + } +} diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 6616b30c..de016221 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,17 +1,18 @@ use crate::{ - listener::listener_loop, 
runtime_node_inputs, runtime_node_outputs, shared_mem_handler, - DoraEvent, Event, + listener, runtime_node_inputs, runtime_node_outputs, shared_mem_handler, DoraEvent, Event, }; use dora_core::{ - config::NodeRunConfig, - daemon_messages::{DataflowId, NodeConfig, RuntimeConfig}, + config::{NodeId, NodeRunConfig}, + daemon_messages::{ + DaemonCommunication, DaemonCommunicationConfig, DataflowId, NodeConfig, RuntimeConfig, + }, descriptor::{resolve_path, source_is_url, OperatorSource, ResolvedNode}, }; use dora_download::download_file; use eyre::{eyre, WrapErr}; use shared_memory_server::{ShmemConf, ShmemServer}; -use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; -use tokio::sync::mpsc; +use std::{env::consts::EXE_EXTENSION, net::Ipv4Addr, path::Path, process::Stdio}; +use tokio::{net::TcpListener, sync::mpsc}; pub async fn spawn_node( dataflow_id: DataflowId, @@ -19,38 +20,19 @@ pub async fn spawn_node( node: ResolvedNode, daemon_tx: mpsc::Sender, shmem_handler_tx: flume::Sender, + config: DaemonCommunicationConfig, ) -> eyre::Result<()> { let node_id = node.id.clone(); tracing::debug!("Spawning node `{dataflow_id}/{node_id}`"); - let daemon_control_region = ShmemConf::new() - .size(4096) - .create() - .wrap_err("failed to allocate daemon_control_region")?; - let daemon_events_region = ShmemConf::new() - .size(4096) - .create() - .wrap_err("failed to allocate daemon_events_region")?; - let daemon_control_region_id = daemon_control_region.get_os_id().to_owned(); - let daemon_events_region_id = daemon_events_region.get_os_id().to_owned(); - { - let server = unsafe { ShmemServer::new(daemon_control_region) } - .wrap_err("failed to create control server")?; - let daemon_tx = daemon_tx.clone(); - let shmem_handler_tx = shmem_handler_tx.clone(); - tokio::task::spawn_blocking(move || listener_loop(server, daemon_tx, shmem_handler_tx)); - } - { - let server = unsafe { ShmemServer::new(daemon_events_region) } - .wrap_err("failed to create events server")?; 
- let event_loop_node_id = format!("{dataflow_id}/{node_id}"); - let daemon_tx = daemon_tx.clone(); - let shmem_handler_tx = shmem_handler_tx.clone(); - tokio::task::spawn_blocking(move || { - listener_loop(server, daemon_tx, shmem_handler_tx); - tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); - }); - } + let daemon_communication = daemon_communication_config( + &dataflow_id, + &node_id, + &daemon_tx, + &shmem_handler_tx, + config, + ) + .await?; let mut child = match node.kind { dora_core::descriptor::CoreNodeKind::Custom(n) => { @@ -64,7 +46,7 @@ pub async fn spawn_node( .wrap_err("failed to download custom node")?; target_path.clone() } else { - resolve_path(&n.source, &working_dir) + resolve_path(&n.source, working_dir) .wrap_err_with(|| format!("failed to resolve node source `{}`", n.source))? }; @@ -76,8 +58,7 @@ pub async fn spawn_node( dataflow_id, node_id: node_id.clone(), run_config: n.run_config.clone(), - daemon_control_region_id, - daemon_events_region_id, + daemon_communication, }; if let Some(args) = &n.args { command.args(args.split_ascii_whitespace()); @@ -133,8 +114,7 @@ pub async fn spawn_node( inputs: runtime_node_inputs(&n), outputs: runtime_node_outputs(&n), }, - daemon_control_region_id, - daemon_events_region_id, + daemon_communication, }, operators: n.operators, }; @@ -170,3 +150,69 @@ pub async fn spawn_node( }); Ok(()) } + +async fn daemon_communication_config( + dataflow_id: &DataflowId, + node_id: &NodeId, + daemon_tx: &mpsc::Sender, + shmem_handler_tx: &flume::Sender, + config: DaemonCommunicationConfig, +) -> eyre::Result { + match config { + DaemonCommunicationConfig::Tcp => { + let localhost = Ipv4Addr::new(127, 0, 0, 1); + let socket = match TcpListener::bind((localhost, 0)).await { + Ok(socket) => socket, + Err(err) => { + return Err( + eyre::Report::new(err).wrap_err("failed to create local TCP listener") + ) + } + }; + let socket_addr = socket + .local_addr() + .wrap_err("failed to get local addr of 
socket")?; + + Ok(DaemonCommunication::Tcp { socket_addr }) + } + DaemonCommunicationConfig::Shmem => { + let daemon_control_region = ShmemConf::new() + .size(4096) + .create() + .wrap_err("failed to allocate daemon_control_region")?; + let daemon_events_region = ShmemConf::new() + .size(4096) + .create() + .wrap_err("failed to allocate daemon_events_region")?; + let daemon_control_region_id = daemon_control_region.get_os_id().to_owned(); + let daemon_events_region_id = daemon_events_region.get_os_id().to_owned(); + + { + let server = unsafe { ShmemServer::new(daemon_control_region) } + .wrap_err("failed to create control server")?; + let daemon_tx = daemon_tx.clone(); + let shmem_handler_tx = shmem_handler_tx.clone(); + tokio::task::spawn_blocking(move || { + listener::shmem::listener_loop(server, daemon_tx, shmem_handler_tx) + }); + } + + { + let server = unsafe { ShmemServer::new(daemon_events_region) } + .wrap_err("failed to create events server")?; + let event_loop_node_id = format!("{dataflow_id}/{node_id}"); + let daemon_tx = daemon_tx.clone(); + let shmem_handler_tx = shmem_handler_tx.clone(); + tokio::task::spawn_blocking(move || { + listener::shmem::listener_loop(server, daemon_tx, shmem_handler_tx); + tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); + }); + } + + Ok(DaemonCommunication::Shmem { + daemon_control_region_id, + daemon_events_region_id, + }) + } + } +} diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index 1013f0ce..c1bb4714 100644 --- a/libraries/core/src/daemon_messages.rs +++ b/libraries/core/src/daemon_messages.rs @@ -1,8 +1,8 @@ -use std::{collections::BTreeMap, path::PathBuf}; +use std::{net::SocketAddr, path::PathBuf}; use crate::{ config::{DataId, NodeId, NodeRunConfig}, - descriptor::{self, OperatorDefinition, ResolvedNode}, + descriptor::{OperatorDefinition, ResolvedNode}, }; use dora_message::Metadata; use uuid::Uuid; @@ -12,8 +12,18 @@ pub struct NodeConfig { pub 
dataflow_id: DataflowId, pub node_id: NodeId, pub run_config: NodeRunConfig, - pub daemon_control_region_id: SharedMemoryId, - pub daemon_events_region_id: SharedMemoryId, + pub daemon_communication: DaemonCommunication, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub enum DaemonCommunication { + Shmem { + daemon_control_region_id: SharedMemoryId, + daemon_events_region_id: SharedMemoryId, + }, + Tcp { + socket_addr: SocketAddr, + }, } #[derive(Debug, serde::Serialize, serde::Deserialize)] @@ -117,4 +127,17 @@ pub struct SpawnDataflowNodes { pub dataflow_id: DataflowId, pub working_dir: PathBuf, pub nodes: Vec, + pub daemon_communication: DaemonCommunicationConfig, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum DaemonCommunicationConfig { + Tcp, + Shmem, +} + +impl Default for DaemonCommunicationConfig { + fn default() -> Self { + Self::Shmem // TODO change to TCP + } } diff --git a/libraries/core/src/descriptor/mod.rs b/libraries/core/src/descriptor/mod.rs index 238ea75b..41cbde15 100644 --- a/libraries/core/src/descriptor/mod.rs +++ b/libraries/core/src/descriptor/mod.rs @@ -1,4 +1,7 @@ -use crate::config::{CommunicationConfig, DataId, InputMapping, NodeId, NodeRunConfig, OperatorId}; +use crate::{ + config::{CommunicationConfig, DataId, InputMapping, NodeId, NodeRunConfig, OperatorId}, + daemon_messages::DaemonCommunicationConfig, +}; use eyre::{bail, Result}; use serde::{Deserialize, Serialize}; use std::{ @@ -18,6 +21,8 @@ pub struct Descriptor { #[serde(with = "serde_yaml::with::singleton_map")] pub communication: CommunicationConfig, pub nodes: Vec, + #[serde(default)] + pub daemon_config: DaemonCommunicationConfig, } pub const SINGLE_OPERATOR_DEFAULT_ID: &str = "op"; From 0840cf0c0ef61b258c3d74b401f27bfe295bc5b7 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 11:42:21 +0100 Subject: [PATCH 140/225] Implement support for TCP-based daemon connection in Rust 
node API --- Cargo.lock | 1 + apis/rust/node/Cargo.toml | 1 + .../node/src/{daemon.rs => daemon/mod.rs} | 111 +++++++++++------- apis/rust/node/src/daemon/tcp.rs | 57 +++++++++ 4 files changed, 130 insertions(+), 40 deletions(-) rename apis/rust/node/src/{daemon.rs => daemon/mod.rs} (81%) create mode 100644 apis/rust/node/src/daemon/tcp.rs diff --git a/Cargo.lock b/Cargo.lock index 58af7d2b..696e6a19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1080,6 +1080,7 @@ dependencies = [ name = "dora-node-api" version = "0.1.2" dependencies = [ + "bincode", "capnp", "dora-core", "dora-message", diff --git a/apis/rust/node/Cargo.toml b/apis/rust/node/Cargo.toml index e62185f4..0775b20d 100644 --- a/apis/rust/node/Cargo.toml +++ b/apis/rust/node/Cargo.toml @@ -23,6 +23,7 @@ capnp = "0.14.11" dora-message = { path = "../../../libraries/message" } dora-core = { path = "../../../libraries/core" } shared-memory-server = { path = "../../../libraries/shared-memory-server" } +bincode = "1.3.3" [dev-dependencies] tokio = { version = "1.17.0", features = ["rt"] } diff --git a/apis/rust/node/src/daemon.rs b/apis/rust/node/src/daemon/mod.rs similarity index 81% rename from apis/rust/node/src/daemon.rs rename to apis/rust/node/src/daemon/mod.rs index 2feae668..0b2ef9f2 100644 --- a/apis/rust/node/src/daemon.rs +++ b/apis/rust/node/src/daemon/mod.rs @@ -5,9 +5,11 @@ use dora_core::{ use dora_message::Metadata; use eyre::{bail, eyre, Context}; use shared_memory_server::{Shmem, ShmemClient, ShmemConf}; -use std::{marker::PhantomData, thread::JoinHandle, time::Duration}; +use std::{marker::PhantomData, net::TcpStream, thread::JoinHandle, time::Duration}; -pub struct DaemonConnection { +mod tcp; + +pub(crate) struct DaemonConnection { pub control_channel: ControlChannel, pub event_stream: EventStream, pub(crate) event_stream_thread: JoinHandle<()>, @@ -19,49 +21,53 @@ impl DaemonConnection { node_id: &NodeId, daemon_communication: &DaemonCommunication, ) -> eyre::Result { - match 
daemon_communication { + let (control, events) = match daemon_communication { DaemonCommunication::Shmem { daemon_control_region_id, daemon_events_region_id, } => { - let control_channel = - ControlChannel::init(dataflow_id, node_id, daemon_control_region_id) - .wrap_err("failed to init control stream")?; - - let (event_stream, event_stream_thread) = - EventStream::init(dataflow_id, node_id, daemon_events_region_id) - .wrap_err("failed to init event stream")?; - - Ok(Self { - control_channel, - event_stream, - event_stream_thread, - }) + let control = unsafe { DaemonChannel::new_shmem(daemon_control_region_id) } + .wrap_err("failed to create shmem control channel")?; + let events = unsafe { DaemonChannel::new_shmem(daemon_events_region_id) } + .wrap_err("failed to create shmem event channel")?; + (control, events) } - DaemonCommunication::Tcp { socket_addr } => todo!(), - } + DaemonCommunication::Tcp { socket_addr } => { + let control = DaemonChannel::new_tcp( + TcpStream::connect(socket_addr).wrap_err("failed to connect control stream")?, + )?; + let events = DaemonChannel::new_tcp( + TcpStream::connect(socket_addr).wrap_err("failed to connect event stream")?, + )?; + (control, events) + } + }; + + let control_channel = ControlChannel::init(dataflow_id, node_id, control) + .wrap_err("failed to init control stream")?; + + let (event_stream, event_stream_thread) = EventStream::init(dataflow_id, node_id, events) + .wrap_err("failed to init event stream")?; + + Ok(Self { + control_channel, + event_stream, + event_stream_thread, + }) } } -pub struct ControlChannel { - channel: ShmemClient, +pub(crate) struct ControlChannel { + channel: DaemonChannel, } impl ControlChannel { - #[tracing::instrument] + #[tracing::instrument(skip(channel))] fn init( dataflow_id: DataflowId, node_id: &NodeId, - daemon_control_region_id: &str, + mut channel: DaemonChannel, ) -> eyre::Result { - let daemon_events_region = ShmemConf::new() - .os_id(daemon_control_region_id) - .open() - 
.wrap_err("failed to connect to dora-daemon")?; - let mut channel = - unsafe { ShmemClient::new(daemon_events_region, Some(Duration::from_secs(5))) } - .wrap_err("failed to create ShmemChannel")?; - register(dataflow_id, node_id.clone(), &mut channel)?; Ok(Self { channel }) @@ -154,10 +160,43 @@ impl ControlChannel { } } +enum DaemonChannel { + Shmem(ShmemClient), + Tcp(TcpStream), +} + +impl DaemonChannel { + #[tracing::instrument] + fn new_tcp(stream: TcpStream) -> eyre::Result { + stream.set_nodelay(true).context("failed to set nodelay")?; + Ok(DaemonChannel::Tcp(stream)) + } + + #[tracing::instrument] + unsafe fn new_shmem(daemon_control_region_id: &str) -> eyre::Result { + let daemon_events_region = ShmemConf::new() + .os_id(daemon_control_region_id) + .open() + .wrap_err("failed to connect to dora-daemon")?; + let channel = DaemonChannel::Shmem( + unsafe { ShmemClient::new(daemon_events_region, Some(Duration::from_secs(5))) } + .wrap_err("failed to create ShmemChannel")?, + ); + Ok(channel) + } + + fn request(&mut self, request: &DaemonRequest) -> eyre::Result { + match self { + DaemonChannel::Shmem(client) => client.request(request), + DaemonChannel::Tcp(stream) => tcp::request(stream, request), + } + } +} + fn register( dataflow_id: DataflowId, node_id: NodeId, - channel: &mut ShmemClient, + channel: &mut DaemonChannel, ) -> eyre::Result<()> { let msg = DaemonRequest::Register { dataflow_id, @@ -186,16 +225,8 @@ impl EventStream { fn init( dataflow_id: DataflowId, node_id: &NodeId, - daemon_events_region_id: &str, + mut channel: DaemonChannel, ) -> eyre::Result<(Self, JoinHandle<()>)> { - let daemon_events_region = ShmemConf::new() - .os_id(daemon_events_region_id) - .open() - .wrap_err("failed to connect to dora-daemon")?; - let mut channel: ShmemClient = - unsafe { ShmemClient::new(daemon_events_region, None) } - .wrap_err("failed to create ShmemChannel")?; - register(dataflow_id, node_id.clone(), &mut channel)?; channel diff --git 
a/apis/rust/node/src/daemon/tcp.rs b/apis/rust/node/src/daemon/tcp.rs new file mode 100644 index 00000000..da2b749b --- /dev/null +++ b/apis/rust/node/src/daemon/tcp.rs @@ -0,0 +1,57 @@ +use dora_core::daemon_messages::{DaemonReply, DaemonRequest}; +use eyre::{eyre, Context}; +use std::{ + io::{Read, Write}, + net::TcpStream, +}; + +pub fn request(connection: &mut TcpStream, request: &DaemonRequest) -> eyre::Result { + send_message(connection, request)?; + receive_reply(connection) + .and_then(|reply| reply.ok_or_else(|| eyre!("server disconnected unexpectedly"))) +} + +fn send_message(connection: &mut TcpStream, message: &DaemonRequest) -> eyre::Result<()> { + let serialized = bincode::serialize(&message).wrap_err("failed to serialize DaemonRequest")?; + tcp_send(connection, &serialized).wrap_err("failed to send DaemonRequest")?; + Ok(()) +} + +fn receive_reply(connection: &mut TcpStream) -> eyre::Result> { + let raw = match tcp_receive(connection) { + Ok(raw) => raw, + Err(err) => match err.kind() { + std::io::ErrorKind::UnexpectedEof | std::io::ErrorKind::ConnectionAborted => { + return Ok(None) + } + other => { + return Err(err).with_context(|| { + format!( + "unexpected I/O error (kind {other:?}) while trying to receive DaemonReply" + ) + }) + } + }, + }; + bincode::deserialize(&raw) + .wrap_err("failed to deserialize DaemonReply") + .map(Some) +} + +fn tcp_send(connection: &mut (impl Write + Unpin), message: &[u8]) -> std::io::Result<()> { + let len_raw = (message.len() as u64).to_le_bytes(); + connection.write_all(&len_raw)?; + connection.write_all(message)?; + Ok(()) +} + +fn tcp_receive(connection: &mut (impl Read + Unpin)) -> std::io::Result> { + let reply_len = { + let mut raw = [0; 8]; + connection.read_exact(&mut raw)?; + u64::from_le_bytes(raw) as usize + }; + let mut reply = vec![0; reply_len]; + connection.read_exact(&mut reply)?; + Ok(reply) +} From 0c2d916ea72d22b0b405a40c66380d8836c88545 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 
21 Feb 2023 11:43:53 +0100 Subject: [PATCH 141/225] Fix: Don't use blocking channel methods in async daemon TCP listener --- binaries/daemon/src/listener/tcp.rs | 55 ++++++++++++++++------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/binaries/daemon/src/listener/tcp.rs b/binaries/daemon/src/listener/tcp.rs index c34b3aa9..09401db4 100644 --- a/binaries/daemon/src/listener/tcp.rs +++ b/binaries/daemon/src/listener/tcp.rs @@ -171,14 +171,14 @@ impl Listener { }; // handle incoming events - self.handle_events()?; + self.handle_events().await?; self.handle_message(message).await?; } Ok(()) } - fn handle_events(&mut self) -> eyre::Result<()> { + async fn handle_events(&mut self) -> eyre::Result<()> { if let Some(events) = &mut self.subscribed_events { while let Ok(event) = events.try_recv() { self.queue.push_back(event); @@ -191,12 +191,12 @@ impl Listener { .filter(|e| matches!(e, NodeEvent::Input { .. })) .count(); let drop_n = input_event_count.saturating_sub(self.max_queue_len); - self.drop_oldest_inputs(drop_n)?; + self.drop_oldest_inputs(drop_n).await?; } Ok(()) } - fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { + async fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { let mut drop_tokens = Vec::new(); for i in 0..number { // find index of oldest input event @@ -216,7 +216,7 @@ impl Listener { } } } - self.report_drop_tokens(drop_tokens)?; + self.report_drop_tokens(drop_tokens).await?; Ok(()) } @@ -245,7 +245,7 @@ impl Listener { data_len, reply_sender, }; - self.send_shared_memory_event(event)?; + self.send_shared_memory_event(event).await?; let reply = reply .await .wrap_err("failed to receive prepare output reply")?; @@ -255,7 +255,7 @@ impl Listener { DaemonRequest::SendPreparedMessage { id } => { let (reply_sender, reply) = oneshot::channel(); let event = shared_mem_handler::NodeEvent::SendPreparedMessage { id, reply_sender }; - self.send_shared_memory_event(event)?; + 
self.send_shared_memory_event(event).await?; self.send_reply( &reply .await @@ -289,24 +289,27 @@ impl Listener { self.subscribed_events = Some(rx); } DaemonRequest::NextEvent { drop_tokens } => { - self.report_drop_tokens(drop_tokens)?; + self.report_drop_tokens(drop_tokens).await?; // try to take the latest queued event first let queued_event = self.queue.pop_front().map(DaemonReply::NodeEvent); - let reply = queued_event.unwrap_or_else(|| { - match self.subscribed_events.as_mut() { - // wait for next event - Some(events) => match events.recv() { - Ok(event) => DaemonReply::NodeEvent(event), - Err(flume::RecvError::Disconnected) => DaemonReply::Closed, - }, - None => { - DaemonReply::Result(Err("Ignoring event request because no subscribe \ - message was sent yet" - .into())) + let reply = match queued_event { + Some(reply) => reply, + None => { + match self.subscribed_events.as_mut() { + // wait for next event + Some(events) => match events.recv_async().await { + Ok(event) => DaemonReply::NodeEvent(event), + Err(flume::RecvError::Disconnected) => DaemonReply::Closed, + }, + None => DaemonReply::Result(Err( + "Ignoring event request because no subscribe \ + message was sent yet" + .into(), + )), } } - }); + }; self.send_reply(&reply).await?; } @@ -314,7 +317,7 @@ impl Listener { Ok(()) } - fn report_drop_tokens( + async fn report_drop_tokens( &mut self, drop_tokens: Vec, ) -> eyre::Result<()> { @@ -322,7 +325,7 @@ impl Listener { let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { tokens: drop_tokens, }); - self.send_shared_memory_event(drop_event)?; + self.send_shared_memory_event(drop_event).await?; } Ok(()) } @@ -353,9 +356,13 @@ impl Listener { .wrap_err("failed to send reply to node") } - fn send_shared_memory_event(&self, event: shared_mem_handler::NodeEvent) -> eyre::Result<()> { + async fn send_shared_memory_event( + &self, + event: shared_mem_handler::NodeEvent, + ) -> eyre::Result<()> { self.shmem_handler_tx - .send(event) + 
.send_async(event) + .await .map_err(|_| eyre!("failed to send event to shared_mem_handler")) } From feee09c6e5720290cd0489e380d9373a4ee0faac Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 11:45:28 +0100 Subject: [PATCH 142/225] Improve logging --- binaries/daemon/src/lib.rs | 2 +- binaries/daemon/src/listener/tcp.rs | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index c2824980..a60db6c0 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -168,7 +168,7 @@ impl Daemon { while let Some(event) = events.next().await { let start = Instant::now(); - let event_debug = format!("{event:?}"); + match event { Event::Coordinator(CoordinatorEvent { event, reply_tx }) => { let (reply, status) = self.handle_coordinator_event(event).await; diff --git a/binaries/daemon/src/listener/tcp.rs b/binaries/daemon/src/listener/tcp.rs index 09401db4..eb1d0876 100644 --- a/binaries/daemon/src/listener/tcp.rs +++ b/binaries/daemon/src/listener/tcp.rs @@ -98,7 +98,8 @@ pub async fn handle_connection_loop( } } } - _ => { + other => { + tracing::warn!("expected register message, got `{other:?}`"); let reply = DaemonReply::Result(Err("must send register message first".into())); if let Err(err) = send_reply(&mut connection, &reply) .await @@ -220,6 +221,7 @@ impl Listener { Ok(()) } + #[tracing::instrument(skip(self), fields(%self.dataflow_id, %self.node_id))] async fn handle_message(&mut self, message: DaemonRequest) -> eyre::Result<()> { match message { DaemonRequest::Register { .. 
} => { From b34af720bb40c0baa90ce60da6074c43513827cc Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 11:45:44 +0100 Subject: [PATCH 143/225] Fix some warnings --- binaries/daemon/src/listener/tcp.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/binaries/daemon/src/listener/tcp.rs b/binaries/daemon/src/listener/tcp.rs index eb1d0876..c3ca824f 100644 --- a/binaries/daemon/src/listener/tcp.rs +++ b/binaries/daemon/src/listener/tcp.rs @@ -1,5 +1,3 @@ -use std::collections::VecDeque; - use crate::{ shared_mem_handler, tcp_utils::{tcp_receive, tcp_send}, @@ -10,6 +8,7 @@ use dora_core::{ daemon_messages::{DaemonReply, DaemonRequest, DataflowId, DropEvent, NodeEvent}, }; use eyre::{eyre, Context}; +use std::collections::VecDeque; use tokio::{ net::{TcpListener, TcpStream}, sync::{mpsc, oneshot}, @@ -118,7 +117,7 @@ async fn receive_message(connection: &mut TcpStream) -> eyre::Result { return Ok(None) } - other => { + _other => { return Err(err) .context("unexpected I/O error while trying to receive DaemonRequest") } @@ -205,7 +204,7 @@ impl Listener { .queue .iter() .position(|e| matches!(e, NodeEvent::Input { .. 
})) - .expect(&format!("no input event found in drop iteration {i}")); + .unwrap_or_else(|| panic!("no input event found in drop iteration {i}")); // remove that event if let Some(event) = self.queue.remove(index) { @@ -336,7 +335,7 @@ impl Listener { // send NodeEvent to daemon main loop let (reply_tx, reply) = oneshot::channel(); let event = Event::Node { - dataflow_id: self.dataflow_id.clone(), + dataflow_id: self.dataflow_id, node_id: self.node_id.clone(), event, reply_sender: reply_tx, From 8e5a3303dac7b369c816ef7c99de58e669fc5b3f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 11:46:14 +0100 Subject: [PATCH 144/225] Fix: Actually spawn the TCP listener node in the daemon --- binaries/daemon/src/spawn.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index de016221..4a8fe635 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -173,6 +173,14 @@ async fn daemon_communication_config( .local_addr() .wrap_err("failed to get local addr of socket")?; + let event_loop_node_id = format!("{dataflow_id}/{node_id}"); + let daemon_tx = daemon_tx.clone(); + let shmem_handler_tx = shmem_handler_tx.clone(); + tokio::spawn(async move { + listener::tcp::listener_loop(socket, daemon_tx, shmem_handler_tx).await; + tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); + }); + Ok(DaemonCommunication::Tcp { socket_addr }) } DaemonCommunicationConfig::Shmem => { From 44d8ee0e6a7da6b1d171b8818c89ac6d2c584602 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 11:47:19 +0100 Subject: [PATCH 145/225] Default to safer TCP for daemon control channels for now --- libraries/core/src/daemon_messages.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/core/src/daemon_messages.rs b/libraries/core/src/daemon_messages.rs index c1bb4714..8807baa4 100644 --- a/libraries/core/src/daemon_messages.rs +++ 
b/libraries/core/src/daemon_messages.rs @@ -138,6 +138,6 @@ pub enum DaemonCommunicationConfig { impl Default for DaemonCommunicationConfig { fn default() -> Self { - Self::Shmem // TODO change to TCP + Self::Tcp } } From 556e2e2ec221b863f833a66a23b49f37385f2561 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 14:48:35 +0100 Subject: [PATCH 146/225] Unify daemon listener implementations to avoid code duplication --- Cargo.lock | 24 +- binaries/daemon/Cargo.toml | 1 + binaries/daemon/src/listener/mod.rs | 408 ++++++++++++++++++++++++++ binaries/daemon/src/listener/shmem.rs | 331 ++++----------------- binaries/daemon/src/listener/tcp.rs | 353 ++-------------------- binaries/daemon/src/spawn.rs | 90 +----- examples/rust-dataflow/dataflow.yml | 2 + 7 files changed, 520 insertions(+), 689 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 696e6a19..8f807965 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -219,9 +219,9 @@ checksum = "30696a84d817107fc028e049980e09d5e140e8da8f1caeb17e8e950658a3cea9" [[package]] name = "async-trait" -version = "0.1.53" +version = "0.1.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed6aa3524a2dfcf9fe180c51eae2b58738348d819517ceadf95789c51fff7600" +checksum = "1cd7fce9ba8c3c042128ce72d8b2ddbf3a05747efb67ea0313c635e10bda47a2" dependencies = [ "proc-macro2", "quote", @@ -716,7 +716,7 @@ dependencies = [ "cfg-if 1.0.0", "crossbeam-utils", "lazy_static", - "memoffset", + "memoffset 0.6.5", "scopeguard", ] @@ -1008,6 +1008,7 @@ dependencies = [ name = "dora-daemon" version = "0.1.0" dependencies = [ + "async-trait", "bincode", "clap 3.2.20", "ctrlc", @@ -2061,6 +2062,15 @@ dependencies = [ "autocfg 1.1.0", ] +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg 1.1.0", +] + [[package]] name = "mime" version = "0.3.16" 
@@ -2301,7 +2311,7 @@ dependencies = [ "cc", "cfg-if 1.0.0", "libc", - "memoffset", + "memoffset 0.6.5", ] [[package]] @@ -2314,7 +2324,7 @@ dependencies = [ "cc", "cfg-if 1.0.0", "libc", - "memoffset", + "memoffset 0.6.5", ] [[package]] @@ -2326,6 +2336,8 @@ dependencies = [ "bitflags", "cfg-if 1.0.0", "libc", + "memoffset 0.7.1", + "pin-utils", "static_assertions", ] @@ -3175,7 +3187,7 @@ checksum = "2a34bde3561f980a51c70495164200569a11662644fe5af017f0b5d7015688cc" dependencies = [ "cfg-if 0.1.10", "libc", - "nix 0.23.1", + "nix 0.26.2", "rand", "winapi", ] diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 67470957..0945a616 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -25,3 +25,4 @@ clap = { version = "3.1.8", features = ["derive"] } shared-memory-server = { path = "../../libraries/shared-memory-server" } ctrlc = "3.2.5" bincode = "1.3.3" +async-trait = "0.1.64" diff --git a/binaries/daemon/src/listener/mod.rs b/binaries/daemon/src/listener/mod.rs index 70fbb7d5..c0803853 100644 --- a/binaries/daemon/src/listener/mod.rs +++ b/binaries/daemon/src/listener/mod.rs @@ -1,2 +1,410 @@ +use crate::{shared_mem_handler, DaemonNodeEvent, Event}; +use dora_core::{ + config::NodeId, + daemon_messages::{ + DaemonCommunication, DaemonCommunicationConfig, DaemonReply, DaemonRequest, DataflowId, + DropEvent, NodeEvent, + }, +}; +use eyre::{eyre, Context}; +use shared_memory_server::{ShmemConf, ShmemServer}; +use std::{collections::VecDeque, net::Ipv4Addr}; +use tokio::{ + net::TcpListener, + sync::{mpsc, oneshot}, +}; + +// TODO unify and avoid duplication; pub mod shmem; pub mod tcp; + +pub async fn spawn_listener_loop( + dataflow_id: &DataflowId, + node_id: &NodeId, + daemon_tx: &mpsc::Sender, + shmem_handler_tx: &flume::Sender, + config: DaemonCommunicationConfig, +) -> eyre::Result { + match config { + DaemonCommunicationConfig::Tcp => { + let localhost = Ipv4Addr::new(127, 0, 0, 1); + let socket = match 
TcpListener::bind((localhost, 0)).await { + Ok(socket) => socket, + Err(err) => { + return Err( + eyre::Report::new(err).wrap_err("failed to create local TCP listener") + ) + } + }; + let socket_addr = socket + .local_addr() + .wrap_err("failed to get local addr of socket")?; + + let event_loop_node_id = format!("{dataflow_id}/{node_id}"); + let daemon_tx = daemon_tx.clone(); + let shmem_handler_tx = shmem_handler_tx.clone(); + tokio::spawn(async move { + tcp::listener_loop(socket, daemon_tx, shmem_handler_tx).await; + tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); + }); + + Ok(DaemonCommunication::Tcp { socket_addr }) + } + DaemonCommunicationConfig::Shmem => { + let daemon_control_region = ShmemConf::new() + .size(4096) + .create() + .wrap_err("failed to allocate daemon_control_region")?; + let daemon_events_region = ShmemConf::new() + .size(4096) + .create() + .wrap_err("failed to allocate daemon_events_region")?; + let daemon_control_region_id = daemon_control_region.get_os_id().to_owned(); + let daemon_events_region_id = daemon_events_region.get_os_id().to_owned(); + + { + let server = unsafe { ShmemServer::new(daemon_control_region) } + .wrap_err("failed to create control server")?; + let daemon_tx = daemon_tx.clone(); + let shmem_handler_tx = shmem_handler_tx.clone(); + tokio::spawn(shmem::listener_loop(server, daemon_tx, shmem_handler_tx)); + } + + { + let server = unsafe { ShmemServer::new(daemon_events_region) } + .wrap_err("failed to create events server")?; + let event_loop_node_id = format!("{dataflow_id}/{node_id}"); + let daemon_tx = daemon_tx.clone(); + let shmem_handler_tx = shmem_handler_tx.clone(); + tokio::task::spawn(async move { + shmem::listener_loop(server, daemon_tx, shmem_handler_tx).await; + tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); + }); + } + + Ok(DaemonCommunication::Shmem { + daemon_control_region_id, + daemon_events_region_id, + }) + } + } +} + +struct Listener { + 
dataflow_id: DataflowId, + node_id: NodeId, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, + subscribed_events: Option>, + max_queue_len: usize, + queue: VecDeque, + connection: C, +} + +impl Listener +where + C: Connection, +{ + pub(crate) async fn run( + mut connection: C, + daemon_tx: mpsc::Sender, + shmem_handler_tx: flume::Sender, + ) { + // receive the first message + let message = match connection + .receive_message() + .await + .wrap_err("failed to receive register message") + { + Ok(Some(m)) => m, + Ok(None) => { + tracing::info!("channel disconnected before register message"); + return; + } // disconnected + Err(err) => { + tracing::info!("{err:?}"); + return; + } + }; + + match message { + DaemonRequest::Register { + dataflow_id, + node_id, + } => { + let reply = DaemonReply::Result(Ok(())); + match connection + .send_reply(reply) + .await + .wrap_err("failed to send register reply") + { + Ok(()) => { + let mut listener = Listener { + dataflow_id, + node_id, + connection, + daemon_tx, + shmem_handler_tx, + subscribed_events: None, + max_queue_len: 10, // TODO: make this configurable + queue: VecDeque::new(), + }; + match listener.run_inner().await.wrap_err("listener failed") { + Ok(()) => {} + Err(err) => tracing::error!("{err:?}"), + } + } + Err(err) => { + tracing::warn!("{err:?}"); + } + } + } + other => { + tracing::warn!("expected register message, got `{other:?}`"); + let reply = DaemonReply::Result(Err("must send register message first".into())); + if let Err(err) = connection + .send_reply(reply) + .await + .wrap_err("failed to send reply") + { + tracing::warn!("{err:?}"); + } + } + } + } + + async fn run_inner(&mut self) -> eyre::Result<()> { + loop { + // receive the next node message + let message = match self + .connection + .receive_message() + .await + .wrap_err("failed to receive DaemonRequest") + { + Ok(Some(m)) => m, + Ok(None) => { + tracing::info!( + "channel disconnected: {}/{}", + self.dataflow_id, + self.node_id + ); 
+ break; + } // disconnected + Err(err) => { + tracing::warn!("{err:?}"); + continue; + } + }; + + // handle incoming events + self.handle_events().await?; + + self.handle_message(message).await?; + } + Ok(()) + } + + async fn handle_events(&mut self) -> eyre::Result<()> { + if let Some(events) = &mut self.subscribed_events { + while let Ok(event) = events.try_recv() { + self.queue.push_back(event); + } + + // drop oldest input events to maintain max queue length queue + let input_event_count = self + .queue + .iter() + .filter(|e| matches!(e, NodeEvent::Input { .. })) + .count(); + let drop_n = input_event_count.saturating_sub(self.max_queue_len); + self.drop_oldest_inputs(drop_n).await?; + } + Ok(()) + } + + async fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { + let mut drop_tokens = Vec::new(); + for i in 0..number { + // find index of oldest input event + let index = self + .queue + .iter() + .position(|e| matches!(e, NodeEvent::Input { .. })) + .unwrap_or_else(|| panic!("no input event found in drop iteration {i}")); + + // remove that event + if let Some(event) = self.queue.remove(index) { + if let NodeEvent::Input { + data: Some(data), .. + } = event + { + drop_tokens.push(data.drop_token); + } + } + } + self.report_drop_tokens(drop_tokens).await?; + Ok(()) + } + + #[tracing::instrument(skip(self), fields(%self.dataflow_id, %self.node_id))] + async fn handle_message(&mut self, message: DaemonRequest) -> eyre::Result<()> { + match message { + DaemonRequest::Register { .. } => { + let reply = DaemonReply::Result(Err("unexpected register message".into())); + self.send_reply(reply).await?; + } + DaemonRequest::Stopped => self.process_daemon_event(DaemonNodeEvent::Stopped).await?, + DaemonRequest::CloseOutputs(outputs) => { + self.process_daemon_event(DaemonNodeEvent::CloseOutputs(outputs)) + .await? 
+ } + DaemonRequest::PrepareOutputMessage { + output_id, + metadata, + data_len, + } => { + let (reply_sender, reply) = oneshot::channel(); + let event = shared_mem_handler::NodeEvent::PrepareOutputMessage { + dataflow_id: self.dataflow_id, + node_id: self.node_id.clone(), + output_id, + metadata, + data_len, + reply_sender, + }; + self.send_shared_memory_event(event).await?; + let reply = reply + .await + .wrap_err("failed to receive prepare output reply")?; + // tracing::debug!("prepare latency: {:?}", start.elapsed()?); + self.send_reply(reply).await?; + } + DaemonRequest::SendPreparedMessage { id } => { + let (reply_sender, reply) = oneshot::channel(); + let event = shared_mem_handler::NodeEvent::SendPreparedMessage { id, reply_sender }; + self.send_shared_memory_event(event).await?; + self.send_reply( + reply + .await + .wrap_err("failed to receive send output reply")?, + ) + .await?; + } + DaemonRequest::SendEmptyMessage { + output_id, + metadata, + } => { + // let elapsed = metadata.timestamp().get_time().to_system_time().elapsed()?; + // tracing::debug!("listener SendEmptyMessage: {elapsed:?}"); + let event = crate::Event::ShmemHandler(crate::ShmemHandlerEvent::SendOut { + dataflow_id: self.dataflow_id, + node_id: self.node_id.clone(), + output_id, + metadata, + data: None, + }); + let result = self + .send_daemon_event(event) + .await + .map_err(|_| "failed to receive send_empty_message reply".to_owned()); + self.send_reply(DaemonReply::Result(result)).await?; + } + DaemonRequest::Subscribe => { + let (tx, rx) = flume::bounded(100); + self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx }) + .await?; + self.subscribed_events = Some(rx); + } + DaemonRequest::NextEvent { drop_tokens } => { + self.report_drop_tokens(drop_tokens).await?; + + // try to take the latest queued event first + let queued_event = self.queue.pop_front().map(DaemonReply::NodeEvent); + let reply = match queued_event { + Some(reply) => reply, + None => { + match 
self.subscribed_events.as_mut() { + // wait for next event + Some(events) => match events.recv_async().await { + Ok(event) => DaemonReply::NodeEvent(event), + Err(flume::RecvError::Disconnected) => DaemonReply::Closed, + }, + None => DaemonReply::Result(Err( + "Ignoring event request because no subscribe \ + message was sent yet" + .into(), + )), + } + } + }; + + self.send_reply(reply).await?; + } + } + Ok(()) + } + + async fn report_drop_tokens( + &mut self, + drop_tokens: Vec, + ) -> eyre::Result<()> { + if !drop_tokens.is_empty() { + let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { + tokens: drop_tokens, + }); + self.send_shared_memory_event(drop_event).await?; + } + Ok(()) + } + + async fn process_daemon_event(&mut self, event: DaemonNodeEvent) -> eyre::Result<()> { + // send NodeEvent to daemon main loop + let (reply_tx, reply) = oneshot::channel(); + let event = Event::Node { + dataflow_id: self.dataflow_id, + node_id: self.node_id.clone(), + event, + reply_sender: reply_tx, + }; + self.daemon_tx + .send(event) + .await + .map_err(|_| eyre!("failed to send event to daemon"))?; + let reply = reply + .await + .map_err(|_| eyre!("failed to receive reply from daemon"))?; + self.send_reply(reply).await?; + Ok(()) + } + + async fn send_reply(&mut self, reply: DaemonReply) -> eyre::Result<()> { + self.connection + .send_reply(reply) + .await + .wrap_err("failed to send reply to node") + } + + async fn send_shared_memory_event( + &self, + event: shared_mem_handler::NodeEvent, + ) -> eyre::Result<()> { + self.shmem_handler_tx + .send_async(event) + .await + .map_err(|_| eyre!("failed to send event to shared_mem_handler")) + } + + async fn send_daemon_event(&self, event: crate::Event) -> eyre::Result<()> { + self.daemon_tx + .send(event) + .await + .map_err(|_| eyre!("failed to send event to daemon")) + } +} + +#[async_trait::async_trait] +trait Connection { + async fn receive_message(&mut self) -> eyre::Result>; + async fn send_reply(&mut self, 
message: DaemonReply) -> eyre::Result<()>; +} diff --git a/binaries/daemon/src/listener/shmem.rs b/binaries/daemon/src/listener/shmem.rs index 9e12a2de..c05f1d86 100644 --- a/binaries/daemon/src/listener/shmem.rs +++ b/binaries/daemon/src/listener/shmem.rs @@ -1,298 +1,75 @@ -use std::collections::VecDeque; - -use crate::{shared_mem_handler, DaemonNodeEvent, Event}; -use dora_core::{ - config::NodeId, - daemon_messages::{DaemonReply, DaemonRequest, DataflowId, DropEvent, NodeEvent}, -}; -use eyre::{eyre, Context}; +use super::Listener; +use crate::{shared_mem_handler, Event}; +use dora_core::daemon_messages::{DaemonReply, DaemonRequest}; +use eyre::eyre; use shared_memory_server::ShmemServer; use tokio::sync::{mpsc, oneshot}; #[tracing::instrument(skip(server, daemon_tx, shmem_handler_tx))] -pub fn listener_loop( +pub async fn listener_loop( mut server: ShmemServer, daemon_tx: mpsc::Sender, shmem_handler_tx: flume::Sender, ) { - // receive the first message - let message = match server - .listen() - .wrap_err("failed to receive register message") - { - Ok(Some(m)) => m, - Ok(None) => { - tracing::info!("channel disconnected before register message"); - return; - } // disconnected - Err(err) => { - tracing::info!("{err:?}"); - return; - } - }; - - match message { - DaemonRequest::Register { - dataflow_id, - node_id, - } => { - let reply = DaemonReply::Result(Ok(())); - match server - .send_reply(&reply) - .wrap_err("failed to send register reply") - { - Ok(()) => { - let mut listener = Listener { - dataflow_id, - node_id, - server, - daemon_tx, - shmem_handler_tx, - subscribed_events: None, - max_queue_len: 10, // TODO: make this configurable - queue: VecDeque::new(), - }; - match listener.run().wrap_err("listener failed") { - Ok(()) => {} - Err(err) => tracing::error!("{err:?}"), + let (tx, rx) = flume::bounded(0); + tokio::task::spawn_blocking(move || { + while let Ok(operation) = rx.recv() { + match operation { + Operation::Receive(sender) => { + if 
sender.send(server.listen()).is_err() { + break; } } - Err(err) => { - tracing::warn!("{err:?}"); + Operation::Send { + message, + result_sender, + } => { + let result = server.send_reply(&message); + if result_sender.send(result).is_err() { + break; + } } } } - _ => { - let reply = DaemonReply::Result(Err("must send register message first".into())); - if let Err(err) = server.send_reply(&reply).wrap_err("failed to send reply") { - tracing::warn!("{err:?}"); - } - } - } + }); + let connection = ShmemConnection(tx); + Listener::run(connection, daemon_tx, shmem_handler_tx).await } -struct Listener { - dataflow_id: DataflowId, - node_id: NodeId, - server: ShmemServer, - daemon_tx: mpsc::Sender, - shmem_handler_tx: flume::Sender, - subscribed_events: Option>, - max_queue_len: usize, - queue: VecDeque, +enum Operation { + Receive(oneshot::Sender>>), + Send { + message: DaemonReply, + result_sender: oneshot::Sender>, + }, } -impl Listener { - fn run(&mut self) -> eyre::Result<()> { - loop { - // receive the next node message - let message = match self - .server - .listen() - .wrap_err("failed to receive DaemonRequest") - { - Ok(Some(m)) => m, - Ok(None) => { - tracing::info!( - "channel disconnected: {}/{}", - self.dataflow_id, - self.node_id - ); - break; - } // disconnected - Err(err) => { - tracing::warn!("{err:?}"); - continue; - } - }; - - // handle incoming events - self.handle_events()?; - - self.handle_message(message)?; - } - Ok(()) - } - - fn handle_events(&mut self) -> eyre::Result<()> { - if let Some(events) = &mut self.subscribed_events { - while let Ok(event) = events.try_recv() { - self.queue.push_back(event); - } - - // drop oldest input events to maintain max queue length queue - let input_event_count = self - .queue - .iter() - .filter(|e| matches!(e, NodeEvent::Input { .. 
})) - .count(); - let drop_n = input_event_count.saturating_sub(self.max_queue_len); - self.drop_oldest_inputs(drop_n)?; - } - Ok(()) - } - - fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { - let mut drop_tokens = Vec::new(); - for i in 0..number { - // find index of oldest input event - let index = self - .queue - .iter() - .position(|e| matches!(e, NodeEvent::Input { .. })) - .expect(&format!("no input event found in drop iteration {i}")); - - // remove that event - if let Some(event) = self.queue.remove(index) { - if let NodeEvent::Input { - data: Some(data), .. - } = event - { - drop_tokens.push(data.drop_token); - } - } - } - self.report_drop_tokens(drop_tokens)?; - Ok(()) - } - - fn handle_message(&mut self, message: DaemonRequest) -> eyre::Result<()> { - match message { - DaemonRequest::Register { .. } => { - let reply = DaemonReply::Result(Err("unexpected register message".into())); - self.send_reply(&reply)?; - } - DaemonRequest::Stopped => self.process_daemon_event(DaemonNodeEvent::Stopped)?, - DaemonRequest::CloseOutputs(outputs) => { - self.process_daemon_event(DaemonNodeEvent::CloseOutputs(outputs))? 
- } - DaemonRequest::PrepareOutputMessage { - output_id, - metadata, - data_len, - } => { - let (reply_sender, reply) = oneshot::channel(); - let event = shared_mem_handler::NodeEvent::PrepareOutputMessage { - dataflow_id: self.dataflow_id, - node_id: self.node_id.clone(), - output_id, - metadata, - data_len, - reply_sender, - }; - self.send_shared_memory_event(event)?; - let reply = reply - .blocking_recv() - .wrap_err("failed to receive prepare output reply")?; - // tracing::debug!("prepare latency: {:?}", start.elapsed()?); - self.send_reply(&reply)?; - } - DaemonRequest::SendPreparedMessage { id } => { - let (reply_sender, reply) = oneshot::channel(); - let event = shared_mem_handler::NodeEvent::SendPreparedMessage { id, reply_sender }; - self.send_shared_memory_event(event)?; - self.send_reply( - &reply - .blocking_recv() - .wrap_err("failed to receive send output reply")?, - )?; - } - DaemonRequest::SendEmptyMessage { - output_id, - metadata, - } => { - // let elapsed = metadata.timestamp().get_time().to_system_time().elapsed()?; - // tracing::debug!("listener SendEmptyMessage: {elapsed:?}"); - let event = crate::Event::ShmemHandler(crate::ShmemHandlerEvent::SendOut { - dataflow_id: self.dataflow_id, - node_id: self.node_id.clone(), - output_id, - metadata, - data: None, - }); - let result = self - .send_daemon_event(event) - .map_err(|_| "failed to receive send_empty_message reply".to_owned()); - self.send_reply(&DaemonReply::Result(result))?; - } - DaemonRequest::Subscribe => { - let (tx, rx) = flume::bounded(100); - self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx })?; - self.subscribed_events = Some(rx); - } - DaemonRequest::NextEvent { drop_tokens } => { - self.report_drop_tokens(drop_tokens)?; - - // try to take the latest queued event first - let queued_event = self.queue.pop_front().map(DaemonReply::NodeEvent); - let reply = queued_event.unwrap_or_else(|| { - match self.subscribed_events.as_mut() { - // wait for next event - 
Some(events) => match events.recv() { - Ok(event) => DaemonReply::NodeEvent(event), - Err(flume::RecvError::Disconnected) => DaemonReply::Closed, - }, - None => { - DaemonReply::Result(Err("Ignoring event request because no subscribe \ - message was sent yet" - .into())) - } - } - }); - - self.send_reply(&reply)?; - } - } - Ok(()) - } - - fn report_drop_tokens( - &mut self, - drop_tokens: Vec, - ) -> eyre::Result<()> { - if !drop_tokens.is_empty() { - let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { - tokens: drop_tokens, - }); - self.send_shared_memory_event(drop_event)?; - } - Ok(()) - } - - fn process_daemon_event(&mut self, event: DaemonNodeEvent) -> eyre::Result<()> { - // send NodeEvent to daemon main loop - let (reply_tx, reply) = oneshot::channel(); - let event = Event::Node { - dataflow_id: self.dataflow_id.clone(), - node_id: self.node_id.clone(), - event, - reply_sender: reply_tx, - }; - self.daemon_tx - .blocking_send(event) - .map_err(|_| eyre!("failed to send event to daemon"))?; - let reply = reply - .blocking_recv() - .map_err(|_| eyre!("failed to receive reply from daemon"))?; - self.send_reply(&reply)?; - Ok(()) - } - - fn send_reply(&mut self, reply: &DaemonReply) -> eyre::Result<()> { - self.server - .send_reply(&reply) - .wrap_err("failed to send reply to node") - } - - fn send_shared_memory_event(&self, event: shared_mem_handler::NodeEvent) -> eyre::Result<()> { - self.shmem_handler_tx - .send(event) - .map_err(|_| eyre!("failed to send event to shared_mem_handler")) +struct ShmemConnection(flume::Sender); + +#[async_trait::async_trait] +impl super::Connection for ShmemConnection { + async fn receive_message(&mut self) -> eyre::Result> { + let (tx, rx) = oneshot::channel(); + self.0 + .send_async(Operation::Receive(tx)) + .await + .map_err(|_| eyre!("failed send receive request to ShmemServer"))?; + rx.await + .map_err(|_| eyre!("failed to receive from ShmemServer")) + .and_then(|r| r) } - fn send_daemon_event(&self, event: 
crate::Event) -> eyre::Result<()> { - self.daemon_tx - .blocking_send(event) - .map_err(|_| eyre!("failed to send event to daemon")) + async fn send_reply(&mut self, reply: DaemonReply) -> eyre::Result<()> { + let (tx, rx) = oneshot::channel(); + self.0 + .send_async(Operation::Send { + message: reply, + result_sender: tx, + }) + .await + .map_err(|_| eyre!("failed send send request to ShmemServer"))?; + rx.await + .map_err(|_| eyre!("failed to receive from ShmemServer")) + .and_then(|r| r) } } diff --git a/binaries/daemon/src/listener/tcp.rs b/binaries/daemon/src/listener/tcp.rs index c3ca824f..cc216cca 100644 --- a/binaries/daemon/src/listener/tcp.rs +++ b/binaries/daemon/src/listener/tcp.rs @@ -1,17 +1,14 @@ +use super::Listener; use crate::{ shared_mem_handler, tcp_utils::{tcp_receive, tcp_send}, - DaemonNodeEvent, Event, + Event, }; -use dora_core::{ - config::NodeId, - daemon_messages::{DaemonReply, DaemonRequest, DataflowId, DropEvent, NodeEvent}, -}; -use eyre::{eyre, Context}; -use std::collections::VecDeque; +use dora_core::daemon_messages::{DaemonReply, DaemonRequest}; +use eyre::Context; use tokio::{ net::{TcpListener, TcpStream}, - sync::{mpsc, oneshot}, + sync::mpsc, }; #[tracing::instrument(skip(listener, daemon_tx, shmem_handler_tx))] @@ -41,8 +38,8 @@ pub async fn listener_loop( } #[tracing::instrument(skip(connection, daemon_tx, shmem_handler_tx))] -pub async fn handle_connection_loop( - mut connection: TcpStream, +async fn handle_connection_loop( + connection: TcpStream, daemon_tx: mpsc::Sender, shmem_handler_tx: flume::Sender, ) { @@ -50,327 +47,37 @@ pub async fn handle_connection_loop( tracing::warn!("failed to set nodelay for connection: {err}"); } - // receive the first message - let message = match receive_message(&mut connection) - .await - .wrap_err("failed to receive register message") - { - Ok(Some(m)) => m, - Ok(None) => { - tracing::info!("channel disconnected before register message"); - return; - } // disconnected - Err(err) => { - 
tracing::info!("{err:?}"); - return; - } - }; - - match message { - DaemonRequest::Register { - dataflow_id, - node_id, - } => { - let reply = DaemonReply::Result(Ok(())); - match send_reply(&mut connection, &reply) - .await - .wrap_err("failed to send register reply") - { - Ok(()) => { - let mut listener = Listener { - dataflow_id, - node_id, - connection, - daemon_tx, - shmem_handler_tx, - subscribed_events: None, - max_queue_len: 10, // TODO: make this configurable - queue: VecDeque::new(), - }; - match listener.run().await.wrap_err("listener failed") { - Ok(()) => {} - Err(err) => tracing::error!("{err:?}"), - } - } - Err(err) => { - tracing::warn!("{err:?}"); - } - } - } - other => { - tracing::warn!("expected register message, got `{other:?}`"); - let reply = DaemonReply::Result(Err("must send register message first".into())); - if let Err(err) = send_reply(&mut connection, &reply) - .await - .wrap_err("failed to send reply") - { - tracing::warn!("{err:?}"); - } - } - } -} - -async fn receive_message(connection: &mut TcpStream) -> eyre::Result> { - let raw = match tcp_receive(connection).await { - Ok(raw) => raw, - Err(err) => match err.kind() { - std::io::ErrorKind::UnexpectedEof | std::io::ErrorKind::ConnectionAborted => { - return Ok(None) - } - _other => { - return Err(err) - .context("unexpected I/O error while trying to receive DaemonRequest") - } - }, - }; - bincode::deserialize(&raw) - .wrap_err("failed to deserialize DaemonRequest") - .map(Some) + Listener::run(TcpConnection(connection), daemon_tx, shmem_handler_tx).await } -async fn send_reply(connection: &mut TcpStream, message: &DaemonReply) -> eyre::Result<()> { - let serialized = bincode::serialize(&message).wrap_err("failed to serialize DaemonReply")?; - tcp_send(connection, &serialized) - .await - .wrap_err("failed to send DaemonReply")?; - Ok(()) -} +struct TcpConnection(TcpStream); -struct Listener { - dataflow_id: DataflowId, - node_id: NodeId, - connection: TcpStream, - daemon_tx: 
mpsc::Sender, - shmem_handler_tx: flume::Sender, - subscribed_events: Option>, - max_queue_len: usize, - queue: VecDeque, -} - -impl Listener { - async fn run(&mut self) -> eyre::Result<()> { - loop { - // receive the next node message - let message = match receive_message(&mut self.connection) - .await - .wrap_err("failed to receive DaemonRequest") - { - Ok(Some(m)) => m, - Ok(None) => { - tracing::info!( - "channel disconnected: {}/{}", - self.dataflow_id, - self.node_id - ); - break; - } // disconnected - Err(err) => { - tracing::warn!("{err:?}"); - continue; +#[async_trait::async_trait] +impl super::Connection for TcpConnection { + async fn receive_message(&mut self) -> eyre::Result> { + let raw = match tcp_receive(&mut self.0).await { + Ok(raw) => raw, + Err(err) => match err.kind() { + std::io::ErrorKind::UnexpectedEof | std::io::ErrorKind::ConnectionAborted => { + return Ok(None) } - }; - - // handle incoming events - self.handle_events().await?; - - self.handle_message(message).await?; - } - Ok(()) - } - - async fn handle_events(&mut self) -> eyre::Result<()> { - if let Some(events) = &mut self.subscribed_events { - while let Ok(event) = events.try_recv() { - self.queue.push_back(event); - } - - // drop oldest input events to maintain max queue length queue - let input_event_count = self - .queue - .iter() - .filter(|e| matches!(e, NodeEvent::Input { .. })) - .count(); - let drop_n = input_event_count.saturating_sub(self.max_queue_len); - self.drop_oldest_inputs(drop_n).await?; - } - Ok(()) - } - - async fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { - let mut drop_tokens = Vec::new(); - for i in 0..number { - // find index of oldest input event - let index = self - .queue - .iter() - .position(|e| matches!(e, NodeEvent::Input { .. 
})) - .unwrap_or_else(|| panic!("no input event found in drop iteration {i}")); - - // remove that event - if let Some(event) = self.queue.remove(index) { - if let NodeEvent::Input { - data: Some(data), .. - } = event - { - drop_tokens.push(data.drop_token); + _other => { + return Err(err) + .context("unexpected I/O error while trying to receive DaemonRequest") } - } - } - self.report_drop_tokens(drop_tokens).await?; - Ok(()) - } - - #[tracing::instrument(skip(self), fields(%self.dataflow_id, %self.node_id))] - async fn handle_message(&mut self, message: DaemonRequest) -> eyre::Result<()> { - match message { - DaemonRequest::Register { .. } => { - let reply = DaemonReply::Result(Err("unexpected register message".into())); - self.send_reply(&reply).await?; - } - DaemonRequest::Stopped => self.process_daemon_event(DaemonNodeEvent::Stopped).await?, - DaemonRequest::CloseOutputs(outputs) => { - self.process_daemon_event(DaemonNodeEvent::CloseOutputs(outputs)) - .await? - } - DaemonRequest::PrepareOutputMessage { - output_id, - metadata, - data_len, - } => { - let (reply_sender, reply) = oneshot::channel(); - let event = shared_mem_handler::NodeEvent::PrepareOutputMessage { - dataflow_id: self.dataflow_id, - node_id: self.node_id.clone(), - output_id, - metadata, - data_len, - reply_sender, - }; - self.send_shared_memory_event(event).await?; - let reply = reply - .await - .wrap_err("failed to receive prepare output reply")?; - // tracing::debug!("prepare latency: {:?}", start.elapsed()?); - self.send_reply(&reply).await?; - } - DaemonRequest::SendPreparedMessage { id } => { - let (reply_sender, reply) = oneshot::channel(); - let event = shared_mem_handler::NodeEvent::SendPreparedMessage { id, reply_sender }; - self.send_shared_memory_event(event).await?; - self.send_reply( - &reply - .await - .wrap_err("failed to receive send output reply")?, - ) - .await?; - } - DaemonRequest::SendEmptyMessage { - output_id, - metadata, - } => { - // let elapsed = 
metadata.timestamp().get_time().to_system_time().elapsed()?; - // tracing::debug!("listener SendEmptyMessage: {elapsed:?}"); - let event = crate::Event::ShmemHandler(crate::ShmemHandlerEvent::SendOut { - dataflow_id: self.dataflow_id, - node_id: self.node_id.clone(), - output_id, - metadata, - data: None, - }); - let result = self - .send_daemon_event(event) - .await - .map_err(|_| "failed to receive send_empty_message reply".to_owned()); - self.send_reply(&DaemonReply::Result(result)).await?; - } - DaemonRequest::Subscribe => { - let (tx, rx) = flume::bounded(100); - self.process_daemon_event(DaemonNodeEvent::Subscribe { event_sender: tx }) - .await?; - self.subscribed_events = Some(rx); - } - DaemonRequest::NextEvent { drop_tokens } => { - self.report_drop_tokens(drop_tokens).await?; - - // try to take the latest queued event first - let queued_event = self.queue.pop_front().map(DaemonReply::NodeEvent); - let reply = match queued_event { - Some(reply) => reply, - None => { - match self.subscribed_events.as_mut() { - // wait for next event - Some(events) => match events.recv_async().await { - Ok(event) => DaemonReply::NodeEvent(event), - Err(flume::RecvError::Disconnected) => DaemonReply::Closed, - }, - None => DaemonReply::Result(Err( - "Ignoring event request because no subscribe \ - message was sent yet" - .into(), - )), - } - } - }; - - self.send_reply(&reply).await?; - } - } - Ok(()) - } - - async fn report_drop_tokens( - &mut self, - drop_tokens: Vec, - ) -> eyre::Result<()> { - if !drop_tokens.is_empty() { - let drop_event = shared_mem_handler::NodeEvent::Drop(DropEvent { - tokens: drop_tokens, - }); - self.send_shared_memory_event(drop_event).await?; - } - Ok(()) - } - - async fn process_daemon_event(&mut self, event: DaemonNodeEvent) -> eyre::Result<()> { - // send NodeEvent to daemon main loop - let (reply_tx, reply) = oneshot::channel(); - let event = Event::Node { - dataflow_id: self.dataflow_id, - node_id: self.node_id.clone(), - event, - 
reply_sender: reply_tx, + }, }; - self.daemon_tx - .send(event) - .await - .map_err(|_| eyre!("failed to send event to daemon"))?; - let reply = reply - .await - .map_err(|_| eyre!("failed to receive reply from daemon"))?; - self.send_reply(&reply).await?; - Ok(()) - } - - async fn send_reply(&mut self, reply: &DaemonReply) -> eyre::Result<()> { - send_reply(&mut self.connection, reply) - .await - .wrap_err("failed to send reply to node") + bincode::deserialize(&raw) + .wrap_err("failed to deserialize DaemonRequest") + .map(Some) } - async fn send_shared_memory_event( - &self, - event: shared_mem_handler::NodeEvent, - ) -> eyre::Result<()> { - self.shmem_handler_tx - .send_async(event) + async fn send_reply(&mut self, message: DaemonReply) -> eyre::Result<()> { + let serialized = + bincode::serialize(&message).wrap_err("failed to serialize DaemonReply")?; + tcp_send(&mut self.0, &serialized) .await - .map_err(|_| eyre!("failed to send event to shared_mem_handler")) - } - - async fn send_daemon_event(&self, event: crate::Event) -> eyre::Result<()> { - self.daemon_tx - .send(event) - .await - .map_err(|_| eyre!("failed to send event to daemon")) + .wrap_err("failed to send DaemonReply")?; + Ok(()) } } diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 4a8fe635..d2c7170e 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,18 +1,16 @@ use crate::{ - listener, runtime_node_inputs, runtime_node_outputs, shared_mem_handler, DoraEvent, Event, + listener::spawn_listener_loop, runtime_node_inputs, runtime_node_outputs, shared_mem_handler, + DoraEvent, Event, }; use dora_core::{ - config::{NodeId, NodeRunConfig}, - daemon_messages::{ - DaemonCommunication, DaemonCommunicationConfig, DataflowId, NodeConfig, RuntimeConfig, - }, + config::NodeRunConfig, + daemon_messages::{DaemonCommunicationConfig, DataflowId, NodeConfig, RuntimeConfig}, descriptor::{resolve_path, source_is_url, OperatorSource, ResolvedNode}, }; use 
dora_download::download_file; use eyre::{eyre, WrapErr}; -use shared_memory_server::{ShmemConf, ShmemServer}; -use std::{env::consts::EXE_EXTENSION, net::Ipv4Addr, path::Path, process::Stdio}; -use tokio::{net::TcpListener, sync::mpsc}; +use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; +use tokio::sync::mpsc; pub async fn spawn_node( dataflow_id: DataflowId, @@ -25,7 +23,7 @@ pub async fn spawn_node( let node_id = node.id.clone(); tracing::debug!("Spawning node `{dataflow_id}/{node_id}`"); - let daemon_communication = daemon_communication_config( + let daemon_communication = spawn_listener_loop( &dataflow_id, &node_id, &daemon_tx, @@ -150,77 +148,3 @@ pub async fn spawn_node( }); Ok(()) } - -async fn daemon_communication_config( - dataflow_id: &DataflowId, - node_id: &NodeId, - daemon_tx: &mpsc::Sender, - shmem_handler_tx: &flume::Sender, - config: DaemonCommunicationConfig, -) -> eyre::Result { - match config { - DaemonCommunicationConfig::Tcp => { - let localhost = Ipv4Addr::new(127, 0, 0, 1); - let socket = match TcpListener::bind((localhost, 0)).await { - Ok(socket) => socket, - Err(err) => { - return Err( - eyre::Report::new(err).wrap_err("failed to create local TCP listener") - ) - } - }; - let socket_addr = socket - .local_addr() - .wrap_err("failed to get local addr of socket")?; - - let event_loop_node_id = format!("{dataflow_id}/{node_id}"); - let daemon_tx = daemon_tx.clone(); - let shmem_handler_tx = shmem_handler_tx.clone(); - tokio::spawn(async move { - listener::tcp::listener_loop(socket, daemon_tx, shmem_handler_tx).await; - tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); - }); - - Ok(DaemonCommunication::Tcp { socket_addr }) - } - DaemonCommunicationConfig::Shmem => { - let daemon_control_region = ShmemConf::new() - .size(4096) - .create() - .wrap_err("failed to allocate daemon_control_region")?; - let daemon_events_region = ShmemConf::new() - .size(4096) - .create() - .wrap_err("failed to allocate 
daemon_events_region")?; - let daemon_control_region_id = daemon_control_region.get_os_id().to_owned(); - let daemon_events_region_id = daemon_events_region.get_os_id().to_owned(); - - { - let server = unsafe { ShmemServer::new(daemon_control_region) } - .wrap_err("failed to create control server")?; - let daemon_tx = daemon_tx.clone(); - let shmem_handler_tx = shmem_handler_tx.clone(); - tokio::task::spawn_blocking(move || { - listener::shmem::listener_loop(server, daemon_tx, shmem_handler_tx) - }); - } - - { - let server = unsafe { ShmemServer::new(daemon_events_region) } - .wrap_err("failed to create events server")?; - let event_loop_node_id = format!("{dataflow_id}/{node_id}"); - let daemon_tx = daemon_tx.clone(); - let shmem_handler_tx = shmem_handler_tx.clone(); - tokio::task::spawn_blocking(move || { - listener::shmem::listener_loop(server, daemon_tx, shmem_handler_tx); - tracing::debug!("event listener loop finished for `{event_loop_node_id}`"); - }); - } - - Ok(DaemonCommunication::Shmem { - daemon_control_region_id, - daemon_events_region_id, - }) - } - } -} diff --git a/examples/rust-dataflow/dataflow.yml b/examples/rust-dataflow/dataflow.yml index 110dc327..d3777d97 100644 --- a/examples/rust-dataflow/dataflow.yml +++ b/examples/rust-dataflow/dataflow.yml @@ -2,6 +2,8 @@ communication: zenoh: prefix: /example-rust-dataflow +daemon_config: Tcp # or Shmem + nodes: - id: rust-node custom: From a6b3bbdf3c79e4347a0702bc1ee910c87a14d638 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 15:08:46 +0100 Subject: [PATCH 147/225] Add python-operator-dataflow example --- examples/python-operator-dataflow/.gitignore | 1 + examples/python-operator-dataflow/README.md | 33 +++++++ .../python-operator-dataflow/dataflow.yml | 27 ++++++ .../dataflow_without_webcam.yml | 27 ++++++ .../python-operator-dataflow/no_webcam.py | 33 +++++++ .../object_detection.py | 42 +++++++++ examples/python-operator-dataflow/plot.py | 86 +++++++++++++++++++ 
.../python-operator-dataflow/requirements.txt | 45 ++++++++++ examples/python-operator-dataflow/run.rs | 46 ++++++++++ examples/python-operator-dataflow/run.sh | 15 ++++ examples/python-operator-dataflow/utils.py | 82 ++++++++++++++++++ examples/python-operator-dataflow/webcam.py | 31 +++++++ 12 files changed, 468 insertions(+) create mode 100644 examples/python-operator-dataflow/.gitignore create mode 100644 examples/python-operator-dataflow/README.md create mode 100644 examples/python-operator-dataflow/dataflow.yml create mode 100644 examples/python-operator-dataflow/dataflow_without_webcam.yml create mode 100755 examples/python-operator-dataflow/no_webcam.py create mode 100755 examples/python-operator-dataflow/object_detection.py create mode 100755 examples/python-operator-dataflow/plot.py create mode 100644 examples/python-operator-dataflow/requirements.txt create mode 100644 examples/python-operator-dataflow/run.rs create mode 100644 examples/python-operator-dataflow/run.sh create mode 100644 examples/python-operator-dataflow/utils.py create mode 100755 examples/python-operator-dataflow/webcam.py diff --git a/examples/python-operator-dataflow/.gitignore b/examples/python-operator-dataflow/.gitignore new file mode 100644 index 00000000..eede66d8 --- /dev/null +++ b/examples/python-operator-dataflow/.gitignore @@ -0,0 +1 @@ +*.pt \ No newline at end of file diff --git a/examples/python-operator-dataflow/README.md b/examples/python-operator-dataflow/README.md new file mode 100644 index 00000000..815a6f53 --- /dev/null +++ b/examples/python-operator-dataflow/README.md @@ -0,0 +1,33 @@ +# Python Dataflow Example + +This examples shows how to create and connect dora operators and custom nodes in Python. + +## Overview + +The [`dataflow.yml`](./dataflow.yml) defines a simple dataflow graph with the following three nodes: + +- a webcam node, that connects to your webcam and feed the dataflow with webcam frame as jpeg compressed bytearray. 
+- an object detection node, that apply Yolo v5 on the webcam image. The model is imported from Pytorch Hub. The output is the bouding box of each object detected, the confidence and the class. You can have more info here: https://pytorch.org/hub/ultralytics_yolov5/ +- a window plotting node, that will retrieve the webcam image and the Yolov5 bounding box and join the two together. + +## Getting started + +```bash +cargo run --example python-dataflow +``` + +## Installation + +To install, you should run the `install.sh` script. + +```bash +install.sh +``` + +## Run the dataflow as a standalone + +- Start the `dora-coordinator`, passing the paths to the dataflow file and the `dora-runtime` as arguments: + +``` +../../target/release/dora-coordinator --run-dataflow dataflow.yml ../../target/release/dora-runtime +``` diff --git a/examples/python-operator-dataflow/dataflow.yml b/examples/python-operator-dataflow/dataflow.yml new file mode 100644 index 00000000..86ac1422 --- /dev/null +++ b/examples/python-operator-dataflow/dataflow.yml @@ -0,0 +1,27 @@ +communication: + zenoh: + prefix: /example-python-dataflow + +nodes: + - id: webcam + custom: + source: webcam.py + inputs: + tick: dora/timer/millis/100 + outputs: + - image + + - id: object_detection + operator: + python: object_detection.py + inputs: + image: webcam/image + outputs: + - bbox + + - id: plot + operator: + python: plot.py + inputs: + image: webcam/image + bbox: object_detection/bbox diff --git a/examples/python-operator-dataflow/dataflow_without_webcam.yml b/examples/python-operator-dataflow/dataflow_without_webcam.yml new file mode 100644 index 00000000..6b9a00af --- /dev/null +++ b/examples/python-operator-dataflow/dataflow_without_webcam.yml @@ -0,0 +1,27 @@ +communication: + zenoh: + prefix: /example-python-no-webcam-dataflow + +nodes: + - id: no_webcam + custom: + source: ./no_webcam.py + inputs: + tick: dora/timer/millis/100 + outputs: + - image + + - id: object_detection + operator: + python: 
object_detection.py + inputs: + image: no_webcam/image + outputs: + - bbox + + - id: plot + operator: + python: plot.py + inputs: + image: no_webcam/image + bbox: object_detection/bbox diff --git a/examples/python-operator-dataflow/no_webcam.py b/examples/python-operator-dataflow/no_webcam.py new file mode 100755 index 00000000..3c322c24 --- /dev/null +++ b/examples/python-operator-dataflow/no_webcam.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import time +import urllib.request + +import cv2 +import numpy as np +from dora import Node + +print("Hello from no_webcam.py") + + +req = urllib.request.urlopen("https://ultralytics.com/images/zidane.jpg") + +arr = np.asarray(bytearray(req.read()), dtype=np.uint8) +node = Node() + +start = time.time() + +while time.time() - start < 20: + # Wait next dora_input + event = node.next() + match event["type"]: + case "INPUT": + print("received input", event["id"]) + node.send_output("image", arr.tobytes()) + case "STOP": + print("received stop") + case other: + print("received unexpected event:", other) + + time.sleep(1) diff --git a/examples/python-operator-dataflow/object_detection.py b/examples/python-operator-dataflow/object_detection.py new file mode 100755 index 00000000..098ec4d1 --- /dev/null +++ b/examples/python-operator-dataflow/object_detection.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from enum import Enum +from typing import Callable + +import cv2 +import numpy as np +import torch + + +class DoraStatus(Enum): + CONTINUE = 0 + STOP = 1 + + +class Operator: + """ + Infering object from images + """ + + def __init__(self): + self.model = torch.hub.load("ultralytics/yolov5", "yolov5n") + + def on_input( + self, + dora_input: dict, + send_output: Callable[[str, bytes], None], + ) -> DoraStatus: + """Handle image + Args: + dora_input (dict): Dict containing the "id", "data", and "metadata" + send_output (Callable[[str, bytes]]): Function enabling sending output back 
to dora. + """ + + frame = np.frombuffer(dora_input["data"], dtype="uint8") + frame = cv2.imdecode(frame, -1) + frame = frame[:, :, ::-1] # OpenCV image (BGR to RGB) + results = self.model(frame) # includes NMS + arrays = np.array(results.xyxy[0].cpu()).tobytes() + send_output("bbox", arrays, dora_input["metadata"]) + return DoraStatus.CONTINUE diff --git a/examples/python-operator-dataflow/plot.py b/examples/python-operator-dataflow/plot.py new file mode 100755 index 00000000..57a2a293 --- /dev/null +++ b/examples/python-operator-dataflow/plot.py @@ -0,0 +1,86 @@ +import os +from enum import Enum +from typing import Callable + +import cv2 +import numpy as np + +from utils import LABELS + +CI = os.environ.get("CI") + +font = cv2.FONT_HERSHEY_SIMPLEX + + +class DoraStatus(Enum): + CONTINUE = 0 + STOP = 1 + + +class Operator: + """ + Plot image and bounding box + """ + + def __init__(self): + self.image = [] + self.bboxs = [] + + def on_input( + self, + dora_input: dict, + send_output: Callable[[str, bytes], None], + ) -> DoraStatus: + """ + Put image and bounding box on cv2 window. + + Args: + dora_input["id"] (str): Id of the dora_input declared in the yaml configuration + dora_input["data"] (bytes): Bytes message of the dora_input + send_output (Callable[[str, bytes]]): Function enabling sending output back to dora. 
+ """ + if dora_input["id"] == "image": + frame = np.frombuffer(dora_input["data"], dtype="uint8") + frame = cv2.imdecode(frame, -1) + self.image = frame + + elif dora_input["id"] == "bbox" and len(self.image) != 0: + bboxs = np.frombuffer(dora_input["data"], dtype="float32") + self.bboxs = np.reshape(bboxs, (-1, 6)) + for bbox in self.bboxs: + [ + min_x, + min_y, + max_x, + max_y, + confidence, + label, + ] = bbox + cv2.rectangle( + self.image, + (int(min_x), int(min_y)), + (int(max_x), int(max_y)), + (0, 255, 0), + 2, + ) + + cv2.putText( + self.image, + LABELS[int(label)] + f", {confidence:0.2f}", + (int(max_x), int(max_y)), + font, + 0.75, + (0, 255, 0), + 2, + 1, + ) + + if CI != "true": + cv2.imshow("frame", self.image) + if cv2.waitKey(1) & 0xFF == ord("q"): + return DoraStatus.STOP + + return DoraStatus.CONTINUE + + def __del__(self): + cv2.destroyAllWindows() diff --git a/examples/python-operator-dataflow/requirements.txt b/examples/python-operator-dataflow/requirements.txt new file mode 100644 index 00000000..55f71178 --- /dev/null +++ b/examples/python-operator-dataflow/requirements.txt @@ -0,0 +1,45 @@ +# YOLOv5 requirements +# Usage: pip install -r requirements.txt + +# Base ---------------------------------------- +matplotlib>=3.2.2 +numpy>=1.18.5 +opencv-python>=4.1.1 +Pillow>=7.1.2 +PyYAML>=5.3.1 +requests>=2.23.0 +scipy>=1.4.1 +torch>=1.7.0 +torchvision>=0.8.1 +tqdm>=4.64.0 +protobuf<=3.20.1 # https://github.com/ultralytics/yolov5/issues/8012 + +# Logging ------------------------------------- +tensorboard>=2.4.1 +# wandb +# clearml + +# Plotting ------------------------------------ +pandas>=1.1.4 +seaborn>=0.11.0 + +# Export -------------------------------------- +# coremltools>=5.2 # CoreML export +# onnx>=1.9.0 # ONNX export +# onnx-simplifier>=0.4.1 # ONNX simplifier +# nvidia-pyindex # TensorRT export +# nvidia-tensorrt # TensorRT export +# scikit-learn==0.19.2 # CoreML quantization +# tensorflow>=2.4.1 # TFLite export (or tensorflow-cpu, 
tensorflow-aarch64) +# tensorflowjs>=3.9.0 # TF.js export +# openvino-dev # OpenVINO export + +# Extras -------------------------------------- +ipython # interactive notebook +psutil # system utilization +thop>=0.1.1 # FLOPs computation +# albumentations>=1.0.3 +# pycocotools>=2.0 # COCO mAP +# roboflow + +opencv-python>=4.1.1 diff --git a/examples/python-operator-dataflow/run.rs b/examples/python-operator-dataflow/run.rs new file mode 100644 index 00000000..ac32ff00 --- /dev/null +++ b/examples/python-operator-dataflow/run.rs @@ -0,0 +1,46 @@ +use eyre::{bail, Context}; +use std::{env, path::Path}; + +#[tokio::main] +async fn main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing subscriber")?; + + let root = Path::new(env!("CARGO_MANIFEST_DIR")); + std::env::set_current_dir(root.join(file!()).parent().unwrap()) + .wrap_err("failed to set working dir")?; + + build_package("dora-daemon").await?; + + run(root).await?; + + Ok(()) +} + +async fn build_package(package: &str) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + cmd.arg("build"); + cmd.arg("--package").arg(package); + if !cmd.status().await?.success() { + bail!("failed to build {package}"); + }; + Ok(()) +} + +async fn run(_root: &Path) -> eyre::Result<()> { + let mut run = tokio::process::Command::new("sh"); + run.arg("./run.sh"); + if !run.status().await?.success() { + bail!("failed to run python example."); + }; + Ok(()) +} + +fn set_up_tracing() -> eyre::Result<()> { + use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer().pretty(); + let subscriber = tracing_subscriber::Registry::default().with(stdout_log); + tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} diff --git a/examples/python-operator-dataflow/run.sh b/examples/python-operator-dataflow/run.sh new file mode 100644 
index 00000000..4a8ac435 --- /dev/null +++ b/examples/python-operator-dataflow/run.sh @@ -0,0 +1,15 @@ +set -e + +python3 -m venv .env +. $(pwd)/.env/bin/activate +# Dev dependencies +pip install maturin +cd ../../apis/python/node +maturin develop +cd ../../../examples/python-operator-dataflow + +# Dependencies +pip install --upgrade pip +pip install -r requirements.txt + +cargo run -p dora-daemon -- --run-dataflow dataflow_without_webcam.yml diff --git a/examples/python-operator-dataflow/utils.py b/examples/python-operator-dataflow/utils.py new file mode 100644 index 00000000..dabc915e --- /dev/null +++ b/examples/python-operator-dataflow/utils.py @@ -0,0 +1,82 @@ +LABELS = [ + "ABC", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", +] diff --git a/examples/python-operator-dataflow/webcam.py b/examples/python-operator-dataflow/webcam.py new file mode 100755 index 00000000..cbcaedfc --- /dev/null +++ b/examples/python-operator-dataflow/webcam.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import time 
+ +import cv2 +from dora import Node + +node = Node() + +video_capture = cv2.VideoCapture(0) + +start = time.time() + +# Run for 20 seconds +while time.time() - start < 10: + # Wait next dora_input + event = node.next() + match event["type"]: + case "INPUT": + ret, frame = video_capture.read() + if ret: + node.send_output("image", cv2.imencode(".jpg", frame)[1].tobytes()) + case "STOP": + print("received stop") + break + case other: + print("received unexpected event:", other) + break + +video_capture.release() From 67976d2c4161edacc0002731c2b07f88861795bd Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 15:16:10 +0100 Subject: [PATCH 148/225] Add some Debug derives --- binaries/runtime/src/operator/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/binaries/runtime/src/operator/mod.rs b/binaries/runtime/src/operator/mod.rs index e590cea1..e48d2ace 100644 --- a/binaries/runtime/src/operator/mod.rs +++ b/binaries/runtime/src/operator/mod.rs @@ -71,6 +71,7 @@ pub fn run_operator( Ok(()) } +#[derive(Debug)] pub enum OperatorEvent { Output { output_id: DataId, @@ -84,6 +85,7 @@ pub enum OperatorEvent { }, } +#[derive(Debug)] pub enum IncomingEvent { Stop, Input { From 63dd2ff03d150bdbe0be46f16c4e30257e495f81 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 16:08:01 +0100 Subject: [PATCH 149/225] Fix `_pthread_rwlock_timedrdlock` link error on macOS --- libraries/shared-memory-server/Cargo.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libraries/shared-memory-server/Cargo.toml b/libraries/shared-memory-server/Cargo.toml index 3fefc942..7c66afe6 100644 --- a/libraries/shared-memory-server/Cargo.toml +++ b/libraries/shared-memory-server/Cargo.toml @@ -10,6 +10,7 @@ license = "Apache-2.0" eyre = "0.6.8" serde = { version = "1.0.152", features = ["derive"] } shared_memory = "0.12.0" -raw_sync = "0.1.5" +# TODO use upstream release once https://github.com/elast0ny/raw_sync-rs/pull/29 is merged 
+raw_sync = { git = "https://github.com/cameronelliott/raw_sync-rs.git" } bincode = "1.3.3" tracing = "0.1.37" From a685b962e69317418e0b62b18b43b9edf3daa758 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 16:08:39 +0100 Subject: [PATCH 150/225] Run python operator dataflow example on CI --- .github/workflows/ci-python.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci-python.yml b/.github/workflows/ci-python.yml index 2a76bd60..8b7d3f13 100644 --- a/.github/workflows/ci-python.yml +++ b/.github/workflows/ci-python.yml @@ -29,3 +29,6 @@ jobs: - name: "Python Dataflow example" run: cargo run --example python-dataflow + + - name: "Python Operator Dataflow example" + run: cargo run --example python-operator-dataflow From c0fd896df48a4cdaa7e4fb72b2ec1d2e6fe72fd6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 16:09:28 +0100 Subject: [PATCH 151/225] Also close inputs when node finishes without sending `Stopped` message --- binaries/daemon/src/lib.rs | 66 +++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index a60db6c0..27ac6b99 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -400,39 +400,44 @@ impl Daemon { let _ = reply_sender.send(DaemonReply::Result(Ok(()))); - // notify downstream nodes - let dataflow = self - .running - .get_mut(&dataflow_id) - .wrap_err_with(|| format!("failed to get downstream nodes: no running dataflow with ID `{dataflow_id}`"))?; - send_input_closed_events(dataflow, |(source_id, _)| source_id == &node_id).await; - - // TODO: notify remote nodes + self.handle_node_stop(dataflow_id, &node_id).await?; + } + } + Ok(()) + } - dataflow.running_nodes.remove(&node_id); - if dataflow.running_nodes.is_empty() { - tracing::info!( - "Dataflow `{dataflow_id}` finished on machine `{}`", - self.machine_id - ); - if let Some(addr) = self.coordinator_addr 
{ - if coordinator::send_event( - addr, - self.machine_id.clone(), - DaemonEvent::AllNodesFinished { - dataflow_id, - result: Ok(()), - }, - ) - .await - .is_err() - { - tracing::warn!("failed to report dataflow finish to coordinator"); - } - } - self.running.remove(&dataflow_id); + #[tracing::instrument(skip(self))] + async fn handle_node_stop( + &mut self, + dataflow_id: Uuid, + node_id: &NodeId, + ) -> Result<(), eyre::ErrReport> { + let dataflow = self.running.get_mut(&dataflow_id).wrap_err_with(|| { + format!("failed to get downstream nodes: no running dataflow with ID `{dataflow_id}`") + })?; + send_input_closed_events(dataflow, |(source_id, _)| source_id == node_id).await; + dataflow.running_nodes.remove(node_id); + if dataflow.running_nodes.is_empty() { + tracing::info!( + "Dataflow `{dataflow_id}` finished on machine `{}`", + self.machine_id + ); + if let Some(addr) = self.coordinator_addr { + if coordinator::send_event( + addr, + self.machine_id.clone(), + DaemonEvent::AllNodesFinished { + dataflow_id, + result: Ok(()), + }, + ) + .await + .is_err() + { + tracing::warn!("failed to report dataflow finish to coordinator"); } } + self.running.remove(&dataflow_id); } Ok(()) } @@ -494,6 +499,7 @@ impl Daemon { tracing::warn!( "node `{dataflow_id}/{node_id}` finished without sending `Stopped` message" ); + self.handle_node_stop(dataflow_id, &node_id).await?; } match result { Ok(()) => { From 1c15fa7af897a9fa8dc6e5af9362f52ede18b153 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 16:09:54 +0100 Subject: [PATCH 152/225] Fix check for `Stopped` message --- binaries/daemon/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 27ac6b99..0b8c2dd8 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -493,7 +493,7 @@ impl Daemon { if self .running .get(&dataflow_id) - .and_then(|d| d.subscribe_channels.get(&node_id)) + .and_then(|d| 
d.running_nodes.get(&node_id)) .is_some() { tracing::warn!( From a82a65b7d3471df7f6664509fbf98c5aa64bbf31 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 21 Feb 2023 16:16:33 +0100 Subject: [PATCH 153/225] Update Cargo.lock --- Cargo.lock | 91 +++++++++++++++++++++--------------------------------- 1 file changed, 35 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8f807965..f20a4d99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14,7 +14,7 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "cipher", "cpufeatures", "opaque-debug 0.3.0", @@ -141,7 +141,7 @@ checksum = "83137067e3a2a6a06d67168e49e68a0957d215410473a740cea95a2425c0b7c6" dependencies = [ "async-io", "blocking", - "cfg-if 1.0.0", + "cfg-if", "event-listener", "futures-lite", "libc", @@ -430,12 +430,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.0" @@ -682,7 +676,7 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -691,7 +685,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", ] @@ -701,7 +695,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] @@ -713,10 +707,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" dependencies = [ "autocfg 1.1.0", - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", "lazy_static", - "memoffset 0.6.5", + "memoffset", "scopeguard", ] @@ -726,7 +720,7 @@ version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "lazy_static", ] @@ -887,7 +881,7 @@ version = "4.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e77a43b28d0668df09411cb0bc9a8c2adc40f9a048afe863e05fd43251e8e39c" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "num_cpus", ] @@ -925,7 +919,7 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "dirs-sys-next", ] @@ -1194,7 +1188,6 @@ name = "dora-runtime" version = "0.1.2" dependencies = [ "clap 3.2.20", - "ctrlc", "dora-core", "dora-download", "dora-message", @@ -1254,7 +1247,7 @@ version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -1525,7 +1518,7 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "js-sys", "libc", "wasi 0.10.2+wasi-snapshot-preview1", @@ -1829,7 +1822,7 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -1966,7 +1959,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "winapi", ] @@ -2006,7 +1999,7 @@ version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "value-bag", ] @@ -2062,15 +2055,6 @@ dependencies = [ "autocfg 1.1.0", ] -[[package]] -name = "memoffset" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" -dependencies = [ - "autocfg 1.1.0", -] - [[package]] name = "mime" version = "0.3.16" @@ -2145,7 +2129,7 @@ name = "napi-build" version = "1.0.1" source = "git+https://github.com/getditto/napi-rs?branch=ditto/closure-into-jsfunction#da095cc3f1af133344083b525d7e9763b347e249" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "ureq", ] @@ -2309,9 +2293,9 @@ checksum = "e4916f159ed8e5de0082076562152a76b7a1f64a01fd9d1e0fea002c37624faf" dependencies = [ "bitflags", "cc", - "cfg-if 1.0.0", + "cfg-if", "libc", - "memoffset 0.6.5", + "memoffset", ] [[package]] @@ -2322,9 +2306,9 @@ checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" dependencies = [ "bitflags", "cc", - "cfg-if 1.0.0", + "cfg-if", "libc", - "memoffset 0.6.5", + "memoffset", ] [[package]] @@ -2334,10 +2318,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags", - "cfg-if 1.0.0", + "cfg-if", "libc", - "memoffset 0.7.1", - "pin-utils", "static_assertions", ] @@ -2483,7 +2465,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "12fc0523e3bd51a692c8850d075d74dc062ccf251c0110668cbd921917118a13" dependencies = [ "bitflags", - "cfg-if 1.0.0", + "cfg-if", "foreign-types", "libc", "once_cell", @@ -2649,7 +2631,7 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28141e0cc4143da2443301914478dc976a61ffdb3f043058310c70df2fed8954" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "redox_syscall", "smallvec", @@ -2886,7 +2868,7 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "685404d509889fade3e86fe3a5803bca2ec09b0c0778d5ada6ec8bf7a8de5259" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "log", "wepoll-ffi", @@ -3018,7 +3000,7 @@ version = "0.16.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e6302e85060011447471887705bb7838f14aba43fcb06957d823739a496b3dc" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "eyre", "indoc", "libc", @@ -3182,12 +3164,11 @@ dependencies = [ [[package]] name = "raw_sync" version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a34bde3561f980a51c70495164200569a11662644fe5af017f0b5d7015688cc" +source = "git+https://github.com/cameronelliott/raw_sync-rs.git#b1d6e16381b498fe618e5c1a1d1f2b1d2e6ef019" dependencies = [ - "cfg-if 0.1.10", + "cfg-if", "libc", - "nix 0.26.2", + "nix 0.23.1", "rand", "winapi", ] @@ -3372,8 +3353,6 @@ version = "0.1.2" dependencies = [ "dora-node-api", "eyre", - "tracing", - "tracing-subscriber", ] [[package]] @@ -3682,7 +3661,7 @@ version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "681a9e90340f748af3a1cc52eb2c040eee29f976b763e99ad90fc0c5df6f9791" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "nix 0.22.3", "rand", @@ -3787,7 +3766,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"af91f480ee899ab2d9f8435bfdfc14d08a5754bd9d3fef1f1a1c23336aad6c8b" dependencies = [ "async-channel", - "cfg-if 1.0.0", + "cfg-if", "futures-core", "pin-project-lite", ] @@ -3839,7 +3818,7 @@ version = "0.24.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d80929a3b477bce3a64360ca82bfb361eacce1dcb7b1fb31e8e5e181e37c212" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "core-foundation-sys", "libc", "ntapi", @@ -3866,7 +3845,7 @@ version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "fastrand", "libc", "redox_syscall", @@ -4154,7 +4133,7 @@ version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -4488,7 +4467,7 @@ version = "0.2.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "serde", "serde_json", "wasm-bindgen-macro", @@ -4515,7 +4494,7 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eb6ec270a31b1d3c7e266b999739109abce8b6c87e4b31fcfcd788b65267395" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "js-sys", "wasm-bindgen", "web-sys", From ea2c39a26c3e1364471c8c1f57bd01ad94695d07 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 10:48:40 +0100 Subject: [PATCH 154/225] Remove unused runtime dependencies --- Cargo.lock | 16 ++-------------- binaries/runtime/Cargo.toml | 7 +------ 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f20a4d99..7a64dae9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1187,7 +1187,6 @@ dependencies = [ name = 
"dora-runtime" version = "0.1.2" dependencies = [ - "clap 3.2.20", "dora-core", "dora-download", "dora-message", @@ -1197,8 +1196,6 @@ dependencies = [ "dora-operator-api-types", "dora-tracing", "eyre", - "fern", - "flume", "futures", "futures-concurrency", "libloading", @@ -1210,8 +1207,6 @@ dependencies = [ "tokio-stream", "tracing", "tracing-subscriber", - "zenoh", - "zenoh-config", ] [[package]] @@ -1294,15 +1289,6 @@ dependencies = [ "instant", ] -[[package]] -name = "fern" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd7b0849075e79ee9a1836df22c717d1eba30451796fdc631b04565dd11e2a" -dependencies = [ - "log", -] - [[package]] name = "fixedbitset" version = "0.4.1" @@ -3353,6 +3339,8 @@ version = "0.1.2" dependencies = [ "dora-node-api", "eyre", + "tracing", + "tracing-subscriber", ] [[package]] diff --git a/binaries/runtime/Cargo.toml b/binaries/runtime/Cargo.toml index c94bb672..b916ad82 100644 --- a/binaries/runtime/Cargo.toml +++ b/binaries/runtime/Cargo.toml @@ -7,7 +7,6 @@ license = "Apache-2.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -clap = { version = "3.1.12", features = ["derive"] } dora-node-api = { path = "../../apis/rust/node", default-features = false } dora-operator-api-python = { path = "../../apis/python/operator" } dora-operator-api-types = { path = "../../apis/rust/operator/types" } @@ -26,12 +25,8 @@ libloading = "0.7.3" serde_yaml = "0.8.23" tokio = { version = "1.17.0", features = ["full"] } tokio-stream = "0.1.8" -zenoh = { git = "https://github.com/eclipse-zenoh/zenoh.git", rev = "79a136e4fd90b11ff5d775ced981af53c4f1071b" } -zenoh-config = { git = "https://github.com/eclipse-zenoh/zenoh.git", rev = "79a136e4fd90b11ff5d775ced981af53c4f1071b" } -fern = "0.6.1" -pyo3 = { version = "0.16", features = ["eyre", "abi3-py37"] } # pyo3-abi3 flag allow simpler linking. 
See: https://pyo3.rs/v0.13.2/building_and_distribution.html -flume = "0.10.14" +pyo3 = { version = "0.16", features = ["eyre", "abi3-py37"] } dora-message = { path = "../../libraries/message" } tracing = "0.1.36" tracing-subscriber = "0.3.15" From e3e55ca76820c55dbc67ad383e96a8d9a52c03cd Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 11:40:51 +0100 Subject: [PATCH 155/225] Exit immediately on second ctrl-c signal --- Cargo.lock | 3 +-- binaries/daemon/src/lib.rs | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7a64dae9..57b74776 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1187,6 +1187,7 @@ dependencies = [ name = "dora-runtime" version = "0.1.2" dependencies = [ + "ctrlc", "dora-core", "dora-download", "dora-message", @@ -3339,8 +3340,6 @@ version = "0.1.2" dependencies = [ "dora-node-api", "eyre", - "tracing", - "tracing-subscriber", ] [[package]] diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 0b8c2dd8..7cb1ebba 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -14,8 +14,9 @@ use futures::{future, stream, FutureExt, TryFutureExt}; use futures_concurrency::stream::Merge; use shared_mem_handler::SharedMemSample; use std::{ + borrow::Cow, collections::{BTreeMap, BTreeSet, HashMap}, - fmt, + fmt, io, net::SocketAddr, path::{Path, PathBuf}, time::{Duration, Instant}, @@ -118,10 +119,17 @@ impl Daemon { ) -> eyre::Result<()> { let (dora_events_tx, dora_events_rx) = mpsc::channel(5); let ctrlc_tx = dora_events_tx.clone(); + let mut ctrlc_sent = false; ctrlc::set_handler(move || { - tracing::info!("received ctrc signal"); - if ctrlc_tx.blocking_send(Event::CtrlC).is_err() { - tracing::error!("failed to report ctrl-c event to dora-daemon"); + if ctrlc_sent { + tracing::warn!("received second ctrc signal -> aborting immediately"); + std::process::abort(); + } else { + tracing::info!("received ctrc signal"); + if 
ctrlc_tx.blocking_send(Event::CtrlC).is_err() { + tracing::error!("failed to report ctrl-c event to dora-daemon"); + } + ctrlc_sent = true; } }) .wrap_err("failed to set ctrl-c handler")?; From 4ca7cc6666e99320005526554bc219be86c3c4df Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 11:42:31 +0100 Subject: [PATCH 156/225] Improve handling of stopped nodes - don't fail daemon when a node errors - detects when a node exited because of a signal - improve log messages --- binaries/daemon/src/lib.rs | 104 +++++++++++++++++++++++++++++------ binaries/daemon/src/spawn.rs | 19 ++----- 2 files changed, 90 insertions(+), 33 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 7cb1ebba..67c40633 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -496,31 +496,66 @@ impl Daemon { DoraEvent::SpawnedNodeResult { dataflow_id, node_id, - result, + exit_status, } => { + let mut signal_exit = false; + match exit_status { + NodeExitStatus::Success => { + tracing::info!("node {dataflow_id}/{node_id} finished successfully"); + } + NodeExitStatus::IoError(err) => { + let err = eyre!(err).wrap_err(format!( + "I/O error while waiting for node `{dataflow_id}/{node_id}`" + )); + tracing::error!("{err:?}",); + } + NodeExitStatus::ExitCode(code) => { + tracing::warn!( + "node {dataflow_id}/{node_id} finished with exit code {code}" + ); + } + NodeExitStatus::Signal(signal) => { + signal_exit = true; + let signal: Cow<_> = match signal { + 1 => "SIGHUP".into(), + 2 => "SIGINT".into(), + 3 => "SIGQUIT".into(), + 4 => "SIGILL".into(), + 6 => "SIGABRT".into(), + 8 => "SIGFPE".into(), + 9 => "SIGKILL".into(), + 11 => "SIGSEGV".into(), + 13 => "SIGPIPE".into(), + 14 => "SIGALRM".into(), + 15 => "SIGTERM".into(), + 22 => "SIGABRT".into(), + 23 => "NSIG".into(), + + other => other.to_string().into(), + }; + tracing::warn!( + "node {dataflow_id}/{node_id} finished because of signal `{signal}`" + ); + } + 
NodeExitStatus::Unknown => { + tracing::warn!( + "node {dataflow_id}/{node_id} finished with unknown exit code" + ); + } + } + if self .running .get(&dataflow_id) .and_then(|d| d.running_nodes.get(&node_id)) .is_some() { - tracing::warn!( - "node `{dataflow_id}/{node_id}` finished without sending `Stopped` message" - ); - self.handle_node_stop(dataflow_id, &node_id).await?; - } - match result { - Ok(()) => { - tracing::info!("node {dataflow_id}/{node_id} finished successfully"); - } - Err(err) => { - let err = err.wrap_err(format!("error in node `{dataflow_id}/{node_id}`")); - if self.exit_when_done.is_some() { - bail!(err); - } else { - tracing::error!("{err:?}",); - } + if !signal_exit { + tracing::warn!( + "node `{dataflow_id}/{node_id}` finished without sending `Stopped` message" + ); } + self.handle_node_stop(dataflow_id, &node_id).await?; } if let Some(exit_when_done) = &mut self.exit_when_done { @@ -786,10 +821,43 @@ pub enum DoraEvent { SpawnedNodeResult { dataflow_id: DataflowId, node_id: NodeId, - result: eyre::Result<()>, + exit_status: NodeExitStatus, }, } +#[derive(Debug)] +pub enum NodeExitStatus { + Success, + IoError(io::Error), + ExitCode(i32), + Signal(i32), + Unknown, +} + +impl From> for NodeExitStatus { + fn from(result: Result) -> Self { + match result { + Ok(status) => { + if status.success() { + NodeExitStatus::Success + } else if let Some(code) = status.code() { + Self::ExitCode(code) + } else { + #[cfg(unix)] + { + use std::os::unix::process::ExitStatusExt; + if let Some(signal) = status.signal() { + return Self::Signal(signal); + } + } + Self::Unknown + } + } + Err(err) => Self::IoError(err), + } + } +} + type MessageId = String; #[must_use] diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index d2c7170e..c32bd32c 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -1,6 +1,6 @@ use crate::{ listener::spawn_listener_loop, runtime_node_inputs, runtime_node_outputs, shared_mem_handler, - 
DoraEvent, Event, + DoraEvent, Event, NodeExitStatus, }; use dora_core::{ config::NodeRunConfig, @@ -126,23 +126,12 @@ pub async fn spawn_node( } }; - let node_id_cloned = node_id.clone(); - let wait_task = async move { - let status = child.wait().await.context("child process failed")?; - if status.success() { - Ok(()) - } else if let Some(code) = status.code() { - Err(eyre!("node {node_id} failed with exit code: {code}")) - } else { - Err(eyre!("node {node_id} failed (unknown exit code)")) - } - }; tokio::spawn(async move { - let result = wait_task.await; + let exit_status = NodeExitStatus::from(child.wait().await); let event = DoraEvent::SpawnedNodeResult { dataflow_id, - node_id: node_id_cloned, - result, + node_id, + exit_status, }; let _ = daemon_tx.send(event.into()).await; }); From 9e2dca0e821dcb1d80f8013f1408984c275eea49 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 11:44:20 +0100 Subject: [PATCH 157/225] Don't error if listener exits because of `ConnectionReset` error --- binaries/daemon/src/listener/tcp.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/binaries/daemon/src/listener/tcp.rs b/binaries/daemon/src/listener/tcp.rs index cc216cca..88c8aee7 100644 --- a/binaries/daemon/src/listener/tcp.rs +++ b/binaries/daemon/src/listener/tcp.rs @@ -1,3 +1,5 @@ +use std::io::ErrorKind; + use super::Listener; use crate::{ shared_mem_handler, @@ -58,9 +60,9 @@ impl super::Connection for TcpConnection { let raw = match tcp_receive(&mut self.0).await { Ok(raw) => raw, Err(err) => match err.kind() { - std::io::ErrorKind::UnexpectedEof | std::io::ErrorKind::ConnectionAborted => { - return Ok(None) - } + ErrorKind::UnexpectedEof + | ErrorKind::ConnectionAborted + | ErrorKind::ConnectionReset => return Ok(None), _other => { return Err(err) .context("unexpected I/O error while trying to receive DaemonRequest") From f43809f9936ac75698dca07b240222e0d8b81df8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 
Feb 2023 11:44:58 +0100 Subject: [PATCH 158/225] Reduce log output --- binaries/daemon/src/listener/mod.rs | 2 +- examples/rust-dataflow/run.rs | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/binaries/daemon/src/listener/mod.rs b/binaries/daemon/src/listener/mod.rs index c0803853..7b3d6812 100644 --- a/binaries/daemon/src/listener/mod.rs +++ b/binaries/daemon/src/listener/mod.rs @@ -184,7 +184,7 @@ where { Ok(Some(m)) => m, Ok(None) => { - tracing::info!( + tracing::debug!( "channel disconnected: {}/{}", self.dataflow_id, self.node_id diff --git a/examples/rust-dataflow/run.rs b/examples/rust-dataflow/run.rs index cddc0ddc..37f26473 100644 --- a/examples/rust-dataflow/run.rs +++ b/examples/rust-dataflow/run.rs @@ -1,5 +1,7 @@ use eyre::{bail, Context}; use std::path::Path; +use tracing::metadata::LevelFilter; +use tracing_subscriber::Layer; #[tokio::main] async fn main() -> eyre::Result<()> { @@ -32,7 +34,9 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { fn set_up_tracing() -> eyre::Result<()> { use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; - let stdout_log = tracing_subscriber::fmt::layer().pretty(); + let stdout_log = tracing_subscriber::fmt::layer() + .pretty() + .with_filter(LevelFilter::DEBUG); let subscriber = tracing_subscriber::Registry::default().with(stdout_log); tracing::subscriber::set_global_default(subscriber) .context("failed to set tracing global subscriber") From f00e27bcf06cad489e869625a057d98e1875b8b0 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 12:23:51 +0100 Subject: [PATCH 159/225] Limit enum size by boxing large fields --- Cargo.lock | 1 - binaries/daemon/src/lib.rs | 7 +++++-- binaries/daemon/src/shared_mem_handler.rs | 9 ++++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 57b74776..4d806445 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1187,7 +1187,6 @@ dependencies = [ name = "dora-runtime" version 
= "0.1.2" dependencies = [ - "ctrlc", "dora-core", "dora-download", "dora-message", diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 67c40633..444ec87f 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -642,7 +642,10 @@ impl Daemon { if let Some(data) = data { if let Err(err) = self .shared_memory_handler - .send_async(shared_mem_handler::DaemonEvent::SentOut { data, drop_tokens }) + .send_async(shared_mem_handler::DaemonEvent::SentOut { + data: *data, + drop_tokens, + }) .await .wrap_err("shared mem handler crashed after send out") { @@ -782,7 +785,7 @@ pub enum ShmemHandlerEvent { node_id: NodeId, output_id: DataId, metadata: dora_message::Metadata<'static>, - data: Option, + data: Option>, }, HandlerError(eyre::ErrReport), } diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index 8a983ae5..2a88ef4a 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -159,9 +159,12 @@ impl SharedMemHandler { metadata, data, } = message; - let data = data.map(|(m, len)| SharedMemSample { - shared_memory: m, - len, + let data = data.map(|(m, len)| { + SharedMemSample { + shared_memory: m, + len, + } + .into() }); let send_result = self From 5e4f1b2519a420b6c164d9e4c728f9cd98e972a1 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 12:24:11 +0100 Subject: [PATCH 160/225] Clean up --- binaries/daemon/src/shared_mem_handler.rs | 7 ------- binaries/daemon/src/spawn.rs | 2 +- binaries/runtime/src/lib.rs | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index 2a88ef4a..343aee1b 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -2,7 +2,6 @@ use core::fmt; use std::{ collections::{HashMap, HashSet}, sync::Arc, - time::Instant, }; use dora_core::{ @@ -63,16 
+62,10 @@ impl SharedMemHandler { ) .merge(); while let Some(event) = events.next().await { - let start = Instant::now(); - match event { Event::Node(event) => self.handle_node_event(event).await?, Event::Daemon(event) => self.handle_daemon_event(event).await?, } - let elapsed = start.elapsed(); - // if elapsed.as_micros() > 10 { - // tracing::debug!("handled event in {elapsed:?}: {event_debug}"); - // } } Ok(()) } diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index c32bd32c..a5f83789 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -8,7 +8,7 @@ use dora_core::{ descriptor::{resolve_path, source_is_url, OperatorSource, ResolvedNode}, }; use dora_download::download_file; -use eyre::{eyre, WrapErr}; +use eyre::WrapErr; use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; use tokio::sync::mpsc; diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index f390d9ed..5c417878 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -11,7 +11,6 @@ use futures::{Stream, StreamExt}; use futures_concurrency::stream::Merge; use operator::{run_operator, OperatorEvent, StopReason}; -use core::fmt; use std::{ collections::{BTreeSet, HashMap}, mem, From 8dc184a7e8df0c659656cfa7ce4806458e2cc854 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 12:24:18 +0100 Subject: [PATCH 161/225] Improve a log message --- binaries/runtime/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 5c417878..4abcc3ec 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -227,7 +227,7 @@ async fn run( open_inputs.remove(&input_id); if open_inputs.is_empty() { // all inputs of the node were closed -> close its event channel - tracing::info!("all inputs of operator {operator_id} were closed -> closing event channel"); + tracing::info!("all inputs of operator 
{}/{operator_id} were closed -> closing event channel", node.id()); open_operator_inputs.remove(&operator_id); operator_channels.remove(&operator_id); } From 2a86213e7036d8d53d5f6e8b49e7f27f54874ebe Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 14:30:57 +0100 Subject: [PATCH 162/225] Cache shared memory regions in daemon and reuse them if size matches --- binaries/daemon/src/shared_mem_handler.rs | 59 +++++++++++++++++------ 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index 343aee1b..a9707b25 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -1,6 +1,6 @@ use core::fmt; use std::{ - collections::{HashMap, HashSet}, + collections::{HashMap, HashSet, VecDeque}, sync::Arc, }; @@ -23,6 +23,8 @@ pub struct SharedMemHandler { prepared_messages: HashMap, sent_out_shared_memory: HashMap>, dropped: HashSet, + + cache: VecDeque, } impl SharedMemHandler { @@ -32,6 +34,7 @@ impl SharedMemHandler { prepared_messages: HashMap::new(), sent_out_shared_memory: HashMap::new(), dropped: HashSet::new(), + cache: VecDeque::new(), } } @@ -77,12 +80,7 @@ impl SharedMemHandler { match self.sent_out_shared_memory.remove(&token) { Some(arc) => { if let Ok(shmem) = Arc::try_unwrap(arc) { - tokio::task::spawn_blocking(move || { - tracing::trace!( - "freeing shared memory after receiving last drop token" - ); - std::mem::drop(shmem); - }); + self.add_to_cache(shmem); } } None => { @@ -110,12 +108,28 @@ impl SharedMemHandler { ); let memory = if data_len > 0 { - Some(ShmemHandle( - ShmemConf::new() - .size(data_len) - .create() - .wrap_err("failed to allocate shared memory")?, - )) + let cache_index = self + .cache + .iter() + .enumerate() + .rev() + .filter(|(_, s)| s.size() >= data_len) + .min_by_key(|(_, s)| s.size()) + .map(|(i, _)| i); + let memory = match cache_index { + Some(i) => { + // we know that this 
index exists, so we can safely unwrap here + self.cache.remove(i).unwrap() + } + None => ShmemHandle(Box::new( + ShmemConf::new() + .size(data_len) + .create() + .wrap_err("failed to allocate shared memory")?, + )), + }; + assert!(memory.size() >= data_len); + Some(memory) } else { None }; @@ -191,10 +205,22 @@ impl SharedMemHandler { .insert(drop_token, memory.clone()); } } + if let Ok(memory) = Arc::try_unwrap(memory) { + self.add_to_cache(memory); + } } } Ok(()) } + + fn add_to_cache(&mut self, memory: ShmemHandle) { + const MAX_CACHE_SIZE: usize = 20; + + self.cache.push_back(memory); + while self.cache.len() > MAX_CACHE_SIZE { + self.cache.pop_front(); + } + } } pub struct SharedMemSample { @@ -272,7 +298,12 @@ struct PreparedMessage { data: Option<(ShmemHandle, usize)>, } -struct ShmemHandle(Shmem); +struct ShmemHandle(Box); +impl ShmemHandle { + fn size(&self) -> usize { + self.0.len() + } +} unsafe impl Send for ShmemHandle {} unsafe impl Sync for ShmemHandle {} From 1553a198efa8c630a144d249e66e936e9d3b7621 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 14:45:27 +0100 Subject: [PATCH 163/225] Re-export dora-message from dora-core --- Cargo.lock | 4 ---- apis/rust/node/Cargo.toml | 5 ++--- apis/rust/node/src/daemon/mod.rs | 6 +++--- apis/rust/node/src/lib.rs | 2 +- binaries/coordinator/Cargo.toml | 1 - binaries/daemon/Cargo.toml | 1 - binaries/daemon/src/lib.rs | 8 ++++---- binaries/daemon/src/shared_mem_handler.rs | 4 ++-- binaries/runtime/Cargo.toml | 1 - binaries/runtime/src/lib.rs | 6 +++--- binaries/runtime/src/operator/mod.rs | 2 +- libraries/core/src/lib.rs | 2 ++ 12 files changed, 18 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4d806445..7dd0050f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -962,7 +962,6 @@ dependencies = [ "communication-layer-request-reply", "dora-core", "dora-download", - "dora-message", "dora-node-api", "eyre", "futures", @@ -1008,7 +1007,6 @@ dependencies = [ "ctrlc", 
"dora-core", "dora-download", - "dora-message", "eyre", "flume", "futures", @@ -1078,7 +1076,6 @@ dependencies = [ "bincode", "capnp", "dora-core", - "dora-message", "eyre", "flume", "once_cell", @@ -1189,7 +1186,6 @@ version = "0.1.2" dependencies = [ "dora-core", "dora-download", - "dora-message", "dora-metrics", "dora-node-api", "dora-operator-api-python", diff --git a/apis/rust/node/Cargo.toml b/apis/rust/node/Cargo.toml index 0775b20d..56cad481 100644 --- a/apis/rust/node/Cargo.toml +++ b/apis/rust/node/Cargo.toml @@ -9,6 +9,8 @@ default = ["tracing-subscriber"] tracing-subscriber = ["dep:tracing-subscriber"] [dependencies] +dora-core = { path = "../../../libraries/core" } +shared-memory-server = { path = "../../../libraries/shared-memory-server" } eyre = "0.6.7" once_cell = "1.13.0" serde = { version = "1.0.136", features = ["derive"] } @@ -20,9 +22,6 @@ tracing-subscriber = { version = "0.3.15", optional = true } flume = "0.10.14" uuid = { version = "1.1.2", features = ["v4"] } capnp = "0.14.11" -dora-message = { path = "../../../libraries/message" } -dora-core = { path = "../../../libraries/core" } -shared-memory-server = { path = "../../../libraries/shared-memory-server" } bincode = "1.3.3" [dev-dependencies] diff --git a/apis/rust/node/src/daemon/mod.rs b/apis/rust/node/src/daemon/mod.rs index 0b2ef9f2..799848f7 100644 --- a/apis/rust/node/src/daemon/mod.rs +++ b/apis/rust/node/src/daemon/mod.rs @@ -1,8 +1,8 @@ use dora_core::{ config::{DataId, NodeId}, daemon_messages::{DaemonCommunication, DaemonReply, DaemonRequest, DataflowId, NodeEvent}, + message::Metadata, }; -use dora_message::Metadata; use eyre::{bail, eyre, Context}; use shared_memory_server::{Shmem, ShmemClient, ShmemConf}; use std::{marker::PhantomData, net::TcpStream, thread::JoinHandle, time::Duration}; @@ -104,7 +104,7 @@ impl ControlChannel { pub fn prepare_message( &mut self, output_id: DataId, - metadata: dora_message::Metadata<'static>, + metadata: Metadata<'static>, data_len: usize, ) 
-> eyre::Result { let reply = self @@ -142,7 +142,7 @@ impl ControlChannel { pub fn send_empty_message( &mut self, output_id: DataId, - metadata: dora_message::Metadata<'static>, + metadata: Metadata<'static>, ) -> eyre::Result<()> { let reply = self .channel diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 8bb22514..64b3fe20 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -2,11 +2,11 @@ use std::thread::JoinHandle; use daemon::{ControlChannel, DaemonConnection, EventStream}; pub use dora_core; +pub use dora_core::message::{uhlc, Metadata, MetadataParameters}; use dora_core::{ config::{DataId, NodeId, NodeRunConfig}, daemon_messages::NodeConfig, }; -pub use dora_message::{uhlc, Metadata, MetadataParameters}; use eyre::WrapErr; pub use flume::Receiver; use shared_memory_server::ShmemConf; diff --git a/binaries/coordinator/Cargo.toml b/binaries/coordinator/Cargo.toml index 1a75c20f..c1ee3e18 100644 --- a/binaries/coordinator/Cargo.toml +++ b/binaries/coordinator/Cargo.toml @@ -21,7 +21,6 @@ uuid = { version = "1.2.1" } time = "0.3.9" rand = "0.8.5" dora-core = { workspace = true } -dora-message = { path = "../../libraries/message" } tracing = "0.1.36" tracing-subscriber = "0.3.15" futures-concurrency = "7.1.0" diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 0945a616..b50484de 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -15,7 +15,6 @@ futures-concurrency = "7.1.0" serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.86" dora-core = { path = "../../libraries/core" } -dora-message = { path = "../../libraries/message" } flume = "0.10.14" dora-download = { path = "../../libraries/extensions/download" } serde_yaml = "0.8.23" diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 444ec87f..cc369814 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -1,4 +1,5 @@ use coordinator::CoordinatorEvent; +use 
dora_core::message::uhlc::HLC; use dora_core::{ config::{DataId, InputMapping, NodeId}, coordinator_messages::DaemonEvent, @@ -8,7 +9,6 @@ use dora_core::{ }, descriptor::{CoreNodeKind, Descriptor, ResolvedNode}, }; -use dora_message::uhlc::HLC; use eyre::{bail, eyre, Context, ContextCompat}; use futures::{future, stream, FutureExt, TryFutureExt}; use futures_concurrency::stream::Merge; @@ -348,7 +348,7 @@ impl Daemon { let event = DoraEvent::Timer { dataflow_id, interval, - metadata: dora_message::Metadata::from_parameters( + metadata: dora_core::message::Metadata::from_parameters( hlc.new_timestamp(), Default::default(), ), @@ -784,7 +784,7 @@ pub enum ShmemHandlerEvent { dataflow_id: DataflowId, node_id: NodeId, output_id: DataId, - metadata: dora_message::Metadata<'static>, + metadata: dora_core::message::Metadata<'static>, data: Option>, }, HandlerError(eyre::ErrReport), @@ -819,7 +819,7 @@ pub enum DoraEvent { Timer { dataflow_id: DataflowId, interval: Duration, - metadata: dora_message::Metadata<'static>, + metadata: dora_core::message::Metadata<'static>, }, SpawnedNodeResult { dataflow_id: DataflowId, diff --git a/binaries/daemon/src/shared_mem_handler.rs b/binaries/daemon/src/shared_mem_handler.rs index a9707b25..2f6c3291 100644 --- a/binaries/daemon/src/shared_mem_handler.rs +++ b/binaries/daemon/src/shared_mem_handler.rs @@ -258,7 +258,7 @@ pub enum NodeEvent { dataflow_id: DataflowId, node_id: NodeId, output_id: DataId, - metadata: dora_message::Metadata<'static>, + metadata: dora_core::message::Metadata<'static>, data_len: usize, reply_sender: oneshot::Sender, }, @@ -294,7 +294,7 @@ struct PreparedMessage { dataflow_id: DataflowId, node_id: NodeId, output_id: DataId, - metadata: dora_message::Metadata<'static>, + metadata: dora_core::message::Metadata<'static>, data: Option<(ShmemHandle, usize)>, } diff --git a/binaries/runtime/Cargo.toml b/binaries/runtime/Cargo.toml index b916ad82..3f358d90 100644 --- a/binaries/runtime/Cargo.toml +++ 
b/binaries/runtime/Cargo.toml @@ -27,7 +27,6 @@ tokio = { version = "1.17.0", features = ["full"] } tokio-stream = "0.1.8" # pyo3-abi3 flag allow simpler linking. See: https://pyo3.rs/v0.13.2/building_and_distribution.html pyo3 = { version = "0.16", features = ["eyre", "abi3-py37"] } -dora-message = { path = "../../libraries/message" } tracing = "0.1.36" tracing-subscriber = "0.3.15" dora-download = { path = "../../libraries/extensions/download" } diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 4abcc3ec..45be61ed 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -141,8 +141,8 @@ async fn run( OperatorEvent::Panic(payload) => std::panic::resume_unwind(payload), OperatorEvent::Finished { reason } => { if let StopReason::ExplicitStopAll = reason { - let hlc = dora_message::uhlc::HLC::default(); - let metadata = dora_message::Metadata::new(hlc.new_timestamp()); + let hlc = dora_core::message::uhlc::HLC::default(); + let metadata = dora_core::message::Metadata::new(hlc.new_timestamp()); let data = metadata .serialize() .wrap_err("failed to serialize stop message")?; @@ -254,7 +254,7 @@ enum Event { Stop, Input { id: dora_core::config::DataId, - metadata: dora_message::Metadata<'static>, + metadata: dora_core::message::Metadata<'static>, data: Option>, }, InputClosed(dora_core::config::DataId), diff --git a/binaries/runtime/src/operator/mod.rs b/binaries/runtime/src/operator/mod.rs index e48d2ace..8103e3c6 100644 --- a/binaries/runtime/src/operator/mod.rs +++ b/binaries/runtime/src/operator/mod.rs @@ -1,8 +1,8 @@ use dora_core::{ config::{DataId, NodeId}, descriptor::{OperatorDefinition, OperatorSource}, + message::{Metadata, MetadataParameters}, }; -use dora_message::{Metadata, MetadataParameters}; use eyre::Context; #[cfg(feature = "tracing")] use opentelemetry::sdk::trace::Tracer; diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index a96517dc..786f5d80 100644 --- a/libraries/core/src/lib.rs 
+++ b/libraries/core/src/lib.rs @@ -4,6 +4,8 @@ use std::{ path::Path, }; +pub use dora_message as message; + pub mod config; pub mod coordinator_messages; pub mod daemon_messages; From e521a2bc50b7f1c9b1411d3dec26f1702cc5cdf6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 15:37:39 +0100 Subject: [PATCH 164/225] Add ctrlc handler to dora-coordinator --- Cargo.lock | 1 + binaries/coordinator/Cargo.toml | 1 + binaries/coordinator/src/lib.rs | 87 +++++++++++++++++++++++++++------ 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7dd0050f..5ec3dcd8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -960,6 +960,7 @@ dependencies = [ "bincode", "clap 3.2.20", "communication-layer-request-reply", + "ctrlc", "dora-core", "dora-download", "dora-node-api", diff --git a/binaries/coordinator/Cargo.toml b/binaries/coordinator/Cargo.toml index c1ee3e18..c75bc79b 100644 --- a/binaries/coordinator/Cargo.toml +++ b/binaries/coordinator/Cargo.toml @@ -30,3 +30,4 @@ dora-download = { path = "../../libraries/extensions/download" } which = "4.3.0" communication-layer-request-reply = { path = "../../libraries/communication-layer/request-reply" } thiserror = "1.0.37" +ctrlc = "3.2.5" diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index db87a046..c45a977b 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -21,7 +21,7 @@ use std::{ path::{Path, PathBuf}, time::Duration, }; -use tokio::net::TcpStream; +use tokio::{net::TcpStream, sync::mpsc}; use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream}; use uuid::Uuid; @@ -55,6 +55,9 @@ pub async fn run(args: Args) -> eyre::Result<()> { } async fn start(runtime_path: &Path) -> eyre::Result<()> { + let (ctrlc_tx, ctrlc_rx) = set_up_ctrlc_handler()?; + let mut ctrlc_tx_handle = Some(ctrlc_tx); + let listener = listener::create_listener(DORA_COORDINATOR_PORT_DEFAULT).await?; let (new_daemon_connections, 
new_daemon_connections_abort) = futures::stream::abortable(TcpListenerStream::new(listener).map(|c| { @@ -76,12 +79,14 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { let daemon_watchdog_interval = tokio_stream::wrappers::IntervalStream::new(tokio::time::interval(Duration::from_secs(1))) .map(|_| Event::DaemonWatchdogInterval); + let ctrlc_events = ReceiverStream::new(ctrlc_rx); let mut events = ( new_daemon_connections, daemon_events, control_events, daemon_watchdog_interval, + ctrlc_events, ) .merge(); @@ -249,20 +254,15 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { ControlRequest::Destroy => { tracing::info!("Received destroy command"); - control_events_abort.abort(); - - // stop all running dataflows - for &uuid in running_dataflows.keys() { - stop_dataflow(&running_dataflows, uuid, &mut daemon_connections) - .await?; - } - - // destroy all connected daemons - destroy_daemons(&mut daemon_connections).await?; - - // prevent the creation of new daemon connections - new_daemon_connections_abort.abort(); - daemon_events_tx = None; + handle_destroy( + &control_events_abort, + &running_dataflows, + &mut daemon_connections, + &new_daemon_connections_abort, + &mut daemon_events_tx, + &mut ctrlc_tx_handle, + ) + .await?; b"ok".as_slice().into() } @@ -313,6 +313,18 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { } } } + Event::CtrlC => { + tracing::info!("Destroying coordinator after receiving Ctrl-C signal"); + handle_destroy( + &control_events_abort, + &running_dataflows, + &mut daemon_connections, + &new_daemon_connections_abort, + &mut daemon_events_tx, + &mut ctrlc_tx_handle, + ) + .await?; + } } } @@ -321,6 +333,49 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { Ok(()) } +fn set_up_ctrlc_handler() -> Result<(mpsc::Sender, mpsc::Receiver), eyre::ErrReport> { + let (ctrlc_tx, ctrlc_rx) = mpsc::channel(1); + + let ctrlc_tx_weak = ctrlc_tx.downgrade(); + let mut ctrlc_sent = false; + ctrlc::set_handler(move || { + 
if ctrlc_sent { + tracing::warn!("received second ctrlc signal -> aborting immediately"); + std::process::abort(); + } else { + tracing::info!("received ctrlc signal"); + if let Some(ctrlc_tx) = ctrlc_tx_weak.upgrade() { + if ctrlc_tx.blocking_send(Event::CtrlC).is_err() { + tracing::error!("failed to report ctrl-c event to dora-coordinator"); + } + } + ctrlc_sent = true; + } + }) + .wrap_err("failed to set ctrl-c handler")?; + + Ok((ctrlc_tx, ctrlc_rx)) +} + +async fn handle_destroy( + control_events_abort: &futures::stream::AbortHandle, + running_dataflows: &HashMap, + daemon_connections: &mut HashMap, + new_daemon_connections_abort: &futures::stream::AbortHandle, + daemon_events_tx: &mut Option>, + ctrlc_tx: &mut Option>, +) -> Result<(), eyre::ErrReport> { + control_events_abort.abort(); + for &uuid in running_dataflows.keys() { + stop_dataflow(running_dataflows, uuid, daemon_connections).await?; + } + destroy_daemons(daemon_connections).await?; + new_daemon_connections_abort.abort(); + *daemon_events_tx = None; + *ctrlc_tx = None; + Ok(()) +} + async fn send_watchdog_message(connection: &mut TcpStream) -> eyre::Result<()> { let message = serde_json::to_vec(&DaemonCoordinatorEvent::Watchdog).unwrap(); @@ -447,7 +502,9 @@ pub enum Event { Control(ControlEvent), Daemon(DaemonEvent), DaemonWatchdogInterval, + CtrlC, } + impl Event { /// Whether this event should be logged. 
#[allow(clippy::match_like_matches_macro)] From f0242299b7120083eca3f6ae39a0763cee4102fe Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 15:45:44 +0100 Subject: [PATCH 165/225] Simplify abortion of event streams on destroy --- binaries/coordinator/src/lib.rs | 66 ++++++++++++--------------------- 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index c45a977b..cf9a617a 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -13,7 +13,7 @@ use dora_core::{ }, }; use eyre::{bail, eyre, ContextCompat, WrapErr}; -use futures::StreamExt; +use futures::{Stream, StreamExt}; use futures_concurrency::stream::Merge; use run::SpawnedDataflow; use std::{ @@ -55,40 +55,32 @@ pub async fn run(args: Args) -> eyre::Result<()> { } async fn start(runtime_path: &Path) -> eyre::Result<()> { - let (ctrlc_tx, ctrlc_rx) = set_up_ctrlc_handler()?; - let mut ctrlc_tx_handle = Some(ctrlc_tx); + let ctrlc_events = set_up_ctrlc_handler()?; let listener = listener::create_listener(DORA_COORDINATOR_PORT_DEFAULT).await?; - let (new_daemon_connections, new_daemon_connections_abort) = - futures::stream::abortable(TcpListenerStream::new(listener).map(|c| { - c.map(Event::NewDaemonConnection) - .wrap_err("failed to open connection") - .unwrap_or_else(Event::DaemonConnectError) - })); + let new_daemon_connections = TcpListenerStream::new(listener).map(|c| { + c.map(Event::NewDaemonConnection) + .wrap_err("failed to open connection") + .unwrap_or_else(Event::DaemonConnectError) + }); let (daemon_events_tx, daemon_events) = tokio::sync::mpsc::channel(2); let mut daemon_events_tx = Some(daemon_events_tx); let daemon_events = ReceiverStream::new(daemon_events); - let (control_events, control_events_abort) = futures::stream::abortable( - control::control_events(control_socket_addr()) - .await - .wrap_err("failed to create control events")?, - ); + let control_events = 
control::control_events(control_socket_addr()) + .await + .wrap_err("failed to create control events")?; let daemon_watchdog_interval = tokio_stream::wrappers::IntervalStream::new(tokio::time::interval(Duration::from_secs(1))) .map(|_| Event::DaemonWatchdogInterval); - let ctrlc_events = ReceiverStream::new(ctrlc_rx); - let mut events = ( - new_daemon_connections, - daemon_events, - control_events, - daemon_watchdog_interval, - ctrlc_events, - ) - .merge(); + // events that should be aborted on `dora destroy` + let (abortable_events, abort_handle) = + futures::stream::abortable((control_events, new_daemon_connections, ctrlc_events).merge()); + + let mut events = (abortable_events, daemon_events, daemon_watchdog_interval).merge(); let mut running_dataflows: HashMap = HashMap::new(); let mut daemon_connections: HashMap<_, TcpStream> = HashMap::new(); @@ -255,12 +247,10 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { tracing::info!("Received destroy command"); handle_destroy( - &control_events_abort, &running_dataflows, &mut daemon_connections, - &new_daemon_connections_abort, + &abort_handle, &mut daemon_events_tx, - &mut ctrlc_tx_handle, ) .await?; @@ -316,12 +306,10 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { Event::CtrlC => { tracing::info!("Destroying coordinator after receiving Ctrl-C signal"); handle_destroy( - &control_events_abort, &running_dataflows, &mut daemon_connections, - &new_daemon_connections_abort, + &abort_handle, &mut daemon_events_tx, - &mut ctrlc_tx_handle, ) .await?; } @@ -333,10 +321,9 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { Ok(()) } -fn set_up_ctrlc_handler() -> Result<(mpsc::Sender, mpsc::Receiver), eyre::ErrReport> { +fn set_up_ctrlc_handler() -> Result, eyre::ErrReport> { let (ctrlc_tx, ctrlc_rx) = mpsc::channel(1); - let ctrlc_tx_weak = ctrlc_tx.downgrade(); let mut ctrlc_sent = false; ctrlc::set_handler(move || { if ctrlc_sent { @@ -344,35 +331,30 @@ fn set_up_ctrlc_handler() -> 
Result<(mpsc::Sender, mpsc::Receiver) std::process::abort(); } else { tracing::info!("received ctrlc signal"); - if let Some(ctrlc_tx) = ctrlc_tx_weak.upgrade() { - if ctrlc_tx.blocking_send(Event::CtrlC).is_err() { - tracing::error!("failed to report ctrl-c event to dora-coordinator"); - } + if ctrlc_tx.blocking_send(Event::CtrlC).is_err() { + tracing::error!("failed to report ctrl-c event to dora-coordinator"); } + ctrlc_sent = true; } }) .wrap_err("failed to set ctrl-c handler")?; - Ok((ctrlc_tx, ctrlc_rx)) + Ok(ReceiverStream::new(ctrlc_rx)) } async fn handle_destroy( - control_events_abort: &futures::stream::AbortHandle, running_dataflows: &HashMap, daemon_connections: &mut HashMap, - new_daemon_connections_abort: &futures::stream::AbortHandle, + abortable_events: &futures::stream::AbortHandle, daemon_events_tx: &mut Option>, - ctrlc_tx: &mut Option>, ) -> Result<(), eyre::ErrReport> { - control_events_abort.abort(); + abortable_events.abort(); for &uuid in running_dataflows.keys() { stop_dataflow(running_dataflows, uuid, daemon_connections).await?; } destroy_daemons(daemon_connections).await?; - new_daemon_connections_abort.abort(); *daemon_events_tx = None; - *ctrlc_tx = None; Ok(()) } From 8c2587e24da84b787e9f15efff639b21daf12dd6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 22 Feb 2023 15:46:14 +0100 Subject: [PATCH 166/225] Fix: abort watchdog timer events on destroy --- binaries/coordinator/src/lib.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index cf9a617a..3af96394 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -77,10 +77,17 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { .map(|_| Event::DaemonWatchdogInterval); // events that should be aborted on `dora destroy` - let (abortable_events, abort_handle) = - futures::stream::abortable((control_events, new_daemon_connections, 
ctrlc_events).merge()); - - let mut events = (abortable_events, daemon_events, daemon_watchdog_interval).merge(); + let (abortable_events, abort_handle) = futures::stream::abortable( + ( + control_events, + new_daemon_connections, + ctrlc_events, + daemon_watchdog_interval, + ) + .merge(), + ); + + let mut events = (abortable_events, daemon_events).merge(); let mut running_dataflows: HashMap = HashMap::new(); let mut daemon_connections: HashMap<_, TcpStream> = HashMap::new(); From 19e87eee7fe6e45f45fd9882df936c1a3c66cae3 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 08:38:33 +0100 Subject: [PATCH 167/225] CI: Run `dora-cli up` to start both coordinator and daemon --- .github/workflows/ci.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e1e40f04..581f1177 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,12 +92,11 @@ jobs: timeout-minutes: 30 run: | cargo install --path binaries/coordinator + cargo install --path binaries/daemon cargo install --path binaries/runtime cargo install --path binaries/cli - - name: "Start dora-coordinator" - run: | - dora-coordinator & + - run: dora-cli up - name: "Test dora `list" run: dora-cli list From 83c8ecf3e949f2ac340d04d1bccdd12c2fca1c14 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 09:11:14 +0100 Subject: [PATCH 168/225] Change Python operator API: `on_event` instead of `on_input` We now report other event types as well. Right now, the only other event type is `Stop`, but we can extend this in the future. 
--- binaries/runtime/src/operator/mod.rs | 40 +++++ binaries/runtime/src/operator/python.rs | 144 ++++++++---------- .../object_detection.py | 8 + examples/python-operator-dataflow/plot.py | 8 + 4 files changed, 118 insertions(+), 82 deletions(-) diff --git a/binaries/runtime/src/operator/mod.rs b/binaries/runtime/src/operator/mod.rs index 8103e3c6..ded825df 100644 --- a/binaries/runtime/src/operator/mod.rs +++ b/binaries/runtime/src/operator/mod.rs @@ -3,9 +3,14 @@ use dora_core::{ descriptor::{OperatorDefinition, OperatorSource}, message::{Metadata, MetadataParameters}, }; +use dora_operator_api_python::metadata_to_pydict; use eyre::Context; #[cfg(feature = "tracing")] use opentelemetry::sdk::trace::Tracer; +use pyo3::{ + types::{PyBytes, PyDict}, + IntoPy, PyObject, Python, +}; use std::any::Any; use tokio::sync::mpsc::{Receiver, Sender}; @@ -95,6 +100,41 @@ pub enum IncomingEvent { }, } +impl IntoPy for IncomingEvent { + fn into_py(self, py: Python) -> PyObject { + let dict = PyDict::new(py); + + let ty = match self { + Self::Stop => "STOP", + Self::Input { + input_id, + metadata, + data, + } => { + dict.set_item("id", input_id.to_string()) + .wrap_err("failed to add input ID") + .unwrap(); + dict.set_item( + "data", + PyBytes::new(py, data.as_deref().unwrap_or_default()), + ) + .wrap_err("failed to add input data") + .unwrap(); + dict.set_item("metadata", metadata_to_pydict(&metadata, py)) + .wrap_err("failed to add input metadata") + .unwrap(); + "INPUT" + } + }; + + dict.set_item("type", ty) + .wrap_err("could not make type a python dictionary item") + .unwrap(); + + dict.into() + } +} + #[derive(Debug)] pub enum StopReason { InputsClosed, diff --git a/binaries/runtime/src/operator/python.rs b/binaries/runtime/src/operator/python.rs index be749d1e..5f8655cc 100644 --- a/binaries/runtime/src/operator/python.rs +++ b/binaries/runtime/src/operator/python.rs @@ -6,15 +6,9 @@ use dora_core::{ descriptor::source_is_url, }; use dora_download::download_file; -use 
dora_operator_api_python::metadata_to_pydict; use dora_operator_api_types::DoraStatus; use eyre::{bail, eyre, Context, Result}; -use pyo3::{ - pyclass, - types::IntoPyDict, - types::{PyBytes, PyDict}, - Py, Python, -}; +use pyo3::{pyclass, types::IntoPyDict, IntoPy, Py, Python}; use std::{ borrow::Cow, panic::{catch_unwind, AssertUnwindSafe}, @@ -23,15 +17,12 @@ use std::{ use tokio::sync::mpsc::{Receiver, Sender}; fn traceback(err: pyo3::PyErr) -> eyre::Report { - Python::with_gil(|py| { - eyre::Report::msg(format!( - "{}\n{err}", - err.traceback(py) - .expect("PyError should have a traceback") - .format() - .expect("Traceback could not be formatted") - )) - }) + let traceback = Python::with_gil(|py| err.traceback(py).and_then(|t| t.format().ok())); + if let Some(traceback) = traceback { + eyre::eyre!("{err}:\n{traceback}") + } else { + eyre::eyre!("{err}") + } } #[tracing::instrument(skip(events_tx, incoming_events, tracer))] @@ -109,72 +100,61 @@ pub fn run( Python::with_gil(init_operator).wrap_err("failed to init python operator")?; let reason = loop { - let Some(event) = incoming_events.blocking_recv() else { break StopReason::InputsClosed }; - - match event { - IncomingEvent::Input { - input_id, - mut metadata, - data, - } => { - #[cfg(feature = "tracing")] - let (_child_cx, string_cx) = { - use dora_tracing::{deserialize_context, serialize_context}; - use opentelemetry::trace::TraceContextExt; - use opentelemetry::{trace::Tracer, Context as OtelContext}; - - let cx = deserialize_context(&metadata.parameters.open_telemetry_context); - let span = tracer.start_with_context(format!("{}", input_id), &cx); - - let child_cx = OtelContext::current_with_span(span); - let string_cx = serialize_context(&child_cx); - (child_cx, string_cx) - }; - - #[cfg(not(feature = "tracing"))] - let string_cx = { - let () = tracer; - "".to_string() - }; - metadata.parameters.open_telemetry_context = Cow::Owned(string_cx); - - let status = Python::with_gil(|py| -> Result { - // We 
need to create a new scoped `GILPool` because the dora-runtime - // is currently started through a `start_runtime` wrapper function, - // which is annotated with `#[pyfunction]`. This attribute creates an - // initial `GILPool` that lasts for the entire lifetime of the `dora-runtime`. - // However, we want the `PyBytes` created below to be freed earlier. - // creating a new scoped `GILPool` tied to this closure, will free `PyBytes` - // at the end of the closure. - // See https://github.com/PyO3/pyo3/pull/2864 and - // https://github.com/PyO3/pyo3/issues/2853 for more details. - let pool = unsafe { py.new_pool() }; - let py = pool.python(); - let input_dict = PyDict::new(py); - - input_dict.set_item("id", input_id.as_str())?; - if let Some(data) = data { - let bytes = PyBytes::new(py, &data); - input_dict.set_item("data", bytes)?; - } - input_dict.set_item("metadata", metadata_to_pydict(&metadata, py))?; - - let status_enum = operator - .call_method1(py, "on_input", (input_dict, send_output.clone())) - .map_err(traceback)?; - let status_val = Python::with_gil(|py| status_enum.getattr(py, "value")) - .wrap_err("on_input must have enum return value")?; - Python::with_gil(|py| status_val.extract(py)) - .wrap_err("on_input has invalid return value") - })?; - match status { - s if s == DoraStatus::Continue as i32 => {} // ok - s if s == DoraStatus::Stop as i32 => break StopReason::ExplicitStop, - s if s == DoraStatus::StopAll as i32 => break StopReason::ExplicitStopAll, - other => bail!("on_input returned invalid status {other}"), - } - } - IncomingEvent::Stop => {} + let Some(mut event) = incoming_events.blocking_recv() else { break StopReason::InputsClosed }; + + if let IncomingEvent::Input { + input_id, metadata, .. 
+ } = &mut event + { + #[cfg(feature = "tracing")] + let (_child_cx, string_cx) = { + use dora_tracing::{deserialize_context, serialize_context}; + use opentelemetry::trace::TraceContextExt; + use opentelemetry::{trace::Tracer, Context as OtelContext}; + + let cx = deserialize_context(&metadata.parameters.open_telemetry_context); + let span = tracer.start_with_context(format!("{}", input_id), &cx); + + let child_cx = OtelContext::current_with_span(span); + let string_cx = serialize_context(&child_cx); + (child_cx, string_cx) + }; + + #[cfg(not(feature = "tracing"))] + let string_cx = { + let _ = input_id; + let () = tracer; + "".to_string() + }; + metadata.parameters.open_telemetry_context = Cow::Owned(string_cx); + } + let status = Python::with_gil(|py| -> Result { + // We need to create a new scoped `GILPool` because the dora-runtime + // is currently started through a `start_runtime` wrapper function, + // which is annotated with `#[pyfunction]`. This attribute creates an + // initial `GILPool` that lasts for the entire lifetime of the `dora-runtime`. + // However, we want the `PyBytes` created below to be freed earlier. + // creating a new scoped `GILPool` tied to this closure, will free `PyBytes` + // at the end of the closure. + // See https://github.com/PyO3/pyo3/pull/2864 and + // https://github.com/PyO3/pyo3/issues/2853 for more details. 
+ let pool = unsafe { py.new_pool() }; + let py = pool.python(); + let input_dict = event.into_py(py); + + let status_enum = operator + .call_method1(py, "on_event", (input_dict, send_output.clone())) + .map_err(traceback)?; + let status_val = Python::with_gil(|py| status_enum.getattr(py, "value")) + .wrap_err("on_event must have enum return value")?; + Python::with_gil(|py| status_val.extract(py)) + .wrap_err("on_event has invalid return value") + })?; + match status { + s if s == DoraStatus::Continue as i32 => {} // ok + s if s == DoraStatus::Stop as i32 => break StopReason::ExplicitStop, + s if s == DoraStatus::StopAll as i32 => break StopReason::ExplicitStopAll, + other => bail!("on_event returned invalid status {other}"), } }; diff --git a/examples/python-operator-dataflow/object_detection.py b/examples/python-operator-dataflow/object_detection.py index 098ec4d1..fd103f86 100755 --- a/examples/python-operator-dataflow/object_detection.py +++ b/examples/python-operator-dataflow/object_detection.py @@ -22,6 +22,14 @@ class Operator: def __init__(self): self.model = torch.hub.load("ultralytics/yolov5", "yolov5n") + def on_event( + self, + dora_event: dict, + send_output: Callable[[str, bytes], None], + ) -> DoraStatus: + if dora_event["type"] == "INPUT": + return self.on_input(dora_event, send_output) + def on_input( self, dora_input: dict, diff --git a/examples/python-operator-dataflow/plot.py b/examples/python-operator-dataflow/plot.py index 57a2a293..6c95eae8 100755 --- a/examples/python-operator-dataflow/plot.py +++ b/examples/python-operator-dataflow/plot.py @@ -26,6 +26,14 @@ class Operator: self.image = [] self.bboxs = [] + def on_event( + self, + dora_event: dict, + send_output: Callable[[str, bytes], None], + ) -> DoraStatus: + if dora_event["type"] == "INPUT": + return self.on_input(dora_event, send_output) + def on_input( self, dora_input: dict, From d69c87e2c7867baa6c31f19ff2f8e8df14385850 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 
Feb 2023 10:08:09 +0100 Subject: [PATCH 169/225] Fix: override dora dependencies with local path when testing templates --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 581f1177..f786d30c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -109,7 +109,7 @@ jobs: timeout-minutes: 30 run: | cd test_project - cargo build --all + cargo build --all --config "patch.'https://github.com/dora-rs/dora.git'.dora-node-api.path=\"../apis/rust/node\"" --config "patch.'https://github.com/dora-rs/dora.git'.dora-operator-api.path=\"../apis/rust/operator\"" UUID=$(dora-cli start dataflow.yml) sleep 10 dora-cli stop $UUID From 706eb562f1fddf59af9e97e1764556d89ec21be0 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 10:15:30 +0100 Subject: [PATCH 170/225] Remove single backtick from step names --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f786d30c..37eb22ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -98,7 +98,7 @@ jobs: - run: dora-cli up - - name: "Test dora `list" + - name: "Test dora list" run: dora-cli list - name: "Test new command" @@ -115,7 +115,7 @@ jobs: dora-cli stop $UUID cd .. 
- - name: "Test dora `destroy" + - name: "Test dora destroy" run: dora-cli destroy examples-remote: From acf4a1dc52a48fcaa0e7e1bbd80d1ac123d1ef2f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 10:20:15 +0100 Subject: [PATCH 171/225] Fix python operators: Also return a `DoraStatus` for non-input events --- examples/python-operator-dataflow/object_detection.py | 1 + examples/python-operator-dataflow/plot.py | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/python-operator-dataflow/object_detection.py b/examples/python-operator-dataflow/object_detection.py index fd103f86..15fd5f76 100755 --- a/examples/python-operator-dataflow/object_detection.py +++ b/examples/python-operator-dataflow/object_detection.py @@ -29,6 +29,7 @@ class Operator: ) -> DoraStatus: if dora_event["type"] == "INPUT": return self.on_input(dora_event, send_output) + return DoraStatus.CONTINUE def on_input( self, diff --git a/examples/python-operator-dataflow/plot.py b/examples/python-operator-dataflow/plot.py index 6c95eae8..dc0c4d63 100755 --- a/examples/python-operator-dataflow/plot.py +++ b/examples/python-operator-dataflow/plot.py @@ -33,6 +33,7 @@ class Operator: ) -> DoraStatus: if dora_event["type"] == "INPUT": return self.on_input(dora_event, send_output) + return DoraStatus.CONTINUE def on_input( self, From 8c8f56a36850ac65092a24c22e1f2d3e3da69460 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 10:20:41 +0100 Subject: [PATCH 172/225] Update Python operator template for new `on_event` method --- .../src/template/python/operator/operator-template.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/binaries/cli/src/template/python/operator/operator-template.py b/binaries/cli/src/template/python/operator/operator-template.py index f503c32a..9c3b39ea 100644 --- a/binaries/cli/src/template/python/operator/operator-template.py +++ b/binaries/cli/src/template/python/operator/operator-template.py @@ -12,6 +12,15 @@ class 
Operator: """Called on initialisation""" pass + def on_event( + self, + dora_event: dict, + send_output: Callable[[str, bytes], None], + ) -> DoraStatus: + if dora_event["type"] == "INPUT": + return self.on_input(dora_event, send_output) + return DoraStatus.CONTINUE + def on_input( self, dora_input: dict, From d334c6f2432450d5bf06af055d49bef7ad543638 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 10:25:47 +0100 Subject: [PATCH 173/225] Make `daemon` module of Rust node API private and reexport symbols --- apis/c++/node/src/lib.rs | 5 +---- apis/c/node/src/lib.rs | 5 +---- apis/python/node/src/lib.rs | 5 +---- apis/rust/node/src/lib.rs | 5 +++-- .../cli/src/template/rust/node/main-template.rs | 16 +++++++++++----- binaries/runtime/src/lib.rs | 8 ++++---- examples/benchmark/sink/src/main.rs | 2 +- examples/rust-dataflow/node/src/main.rs | 2 +- examples/rust-dataflow/sink/src/main.rs | 2 +- 9 files changed, 24 insertions(+), 26 deletions(-) diff --git a/apis/c++/node/src/lib.rs b/apis/c++/node/src/lib.rs index b60f0fda..10aba1ba 100644 --- a/apis/c++/node/src/lib.rs +++ b/apis/c++/node/src/lib.rs @@ -1,7 +1,4 @@ -use dora_node_api::{ - self, - daemon::{Event, EventStream}, -}; +use dora_node_api::{self, Event, EventStream}; use eyre::bail; #[cxx::bridge] diff --git a/apis/c/node/src/lib.rs b/apis/c/node/src/lib.rs index 5f13ab84..7583a697 100644 --- a/apis/c/node/src/lib.rs +++ b/apis/c/node/src/lib.rs @@ -1,9 +1,6 @@ #![deny(unsafe_op_in_unsafe_fn)] -use dora_node_api::{ - daemon::{Event, EventStream}, - DoraNode, -}; +use dora_node_api::{DoraNode, Event, EventStream}; use eyre::Context; use std::{ffi::c_void, ptr, slice}; diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index e083c73d..d50069c6 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -1,9 +1,6 @@ #![allow(clippy::borrow_deref_ref)] // clippy warns about code generated by #[pymethods] -use dora_node_api::{ - daemon::{Event, 
EventStream}, - DoraNode, -}; +use dora_node_api::{DoraNode, Event, EventStream}; use dora_operator_api_python::{metadata_to_pydict, pydict_to_metadata}; use eyre::{Context, Result}; use pyo3::{ diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 64b3fe20..32a57095 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -1,6 +1,7 @@ use std::thread::JoinHandle; -use daemon::{ControlChannel, DaemonConnection, EventStream}; +use daemon::{ControlChannel, DaemonConnection}; +pub use daemon::{Event, EventStream}; pub use dora_core; pub use dora_core::message::{uhlc, Metadata, MetadataParameters}; use dora_core::{ @@ -11,7 +12,7 @@ use eyre::WrapErr; pub use flume::Receiver; use shared_memory_server::ShmemConf; -pub mod daemon; +mod daemon; pub struct DoraNode { id: NodeId, diff --git a/binaries/cli/src/template/rust/node/main-template.rs b/binaries/cli/src/template/rust/node/main-template.rs index f1fdfbbb..4ae73e38 100644 --- a/binaries/cli/src/template/rust/node/main-template.rs +++ b/binaries/cli/src/template/rust/node/main-template.rs @@ -2,12 +2,18 @@ use dora_node_api::DoraNode; use std::error::Error; fn main() -> Result<(), Box> { - let mut node = DoraNode::init_from_env()?; - let inputs = node.inputs()?; + let (mut node, mut events) = DoraNode::init_from_env()?; - while let Ok(input) = inputs.recv() { - match input.id.as_str() { - other => eprintln!("Received input `{other}`"), + while let Some(event) = events.recv() { + match event { + Event::Input { + id, + metadata, + data: _, + } => match id.as_str() { + other => eprintln!("Received input `{other}`"), + }, + _ => {} } } diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 45be61ed..a226bda5 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -52,14 +52,14 @@ pub fn main() -> eyre::Result<()> { }); let daemon_events = Box::pin(futures::stream::unfold(daemon_events, |mut stream| async { let event = 
stream.recv_async().await.map(|event| match event { - dora_node_api::daemon::Event::Stop => Event::Stop, - dora_node_api::daemon::Event::Input { id, metadata, data } => Event::Input { + dora_node_api::Event::Stop => Event::Stop, + dora_node_api::Event::Input { id, metadata, data } => Event::Input { id, metadata, data: data.map(|data| data.to_owned()), }, - dora_node_api::daemon::Event::InputClosed { id } => Event::InputClosed(id), - dora_node_api::daemon::Event::Error(err) => Event::Error(err), + dora_node_api::Event::InputClosed { id } => Event::InputClosed(id), + dora_node_api::Event::Error(err) => Event::Error(err), _ => todo!(), }); event.map(|event| (event, stream)) diff --git a/examples/benchmark/sink/src/main.rs b/examples/benchmark/sink/src/main.rs index 97703852..05772dcb 100644 --- a/examples/benchmark/sink/src/main.rs +++ b/examples/benchmark/sink/src/main.rs @@ -1,4 +1,4 @@ -use dora_node_api::{self, daemon::Event, DoraNode}; +use dora_node_api::{self, DoraNode, Event}; use eyre::Context; use std::time::{Duration, Instant}; use tracing_subscriber::Layer; diff --git a/examples/rust-dataflow/node/src/main.rs b/examples/rust-dataflow/node/src/main.rs index 0bdabcea..c52e4618 100644 --- a/examples/rust-dataflow/node/src/main.rs +++ b/examples/rust-dataflow/node/src/main.rs @@ -1,4 +1,4 @@ -use dora_node_api::{self, daemon::Event, dora_core::config::DataId, DoraNode}; +use dora_node_api::{self, dora_core::config::DataId, DoraNode, Event}; fn main() -> eyre::Result<()> { println!("hello"); diff --git a/examples/rust-dataflow/sink/src/main.rs b/examples/rust-dataflow/sink/src/main.rs index ef6f8f14..ab49698d 100644 --- a/examples/rust-dataflow/sink/src/main.rs +++ b/examples/rust-dataflow/sink/src/main.rs @@ -1,4 +1,4 @@ -use dora_node_api::{self, daemon::Event, DoraNode}; +use dora_node_api::{self, DoraNode, Event}; use eyre::ContextCompat; fn main() -> eyre::Result<()> { From 3658b5356d558abfd82d5e716b25fd74d56b4f62 Mon Sep 17 00:00:00 2001 From: Philipp 
Oppermann Date: Thu, 23 Feb 2023 10:26:39 +0100 Subject: [PATCH 174/225] Fix import in Rust node template --- binaries/cli/src/template/rust/node/main-template.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binaries/cli/src/template/rust/node/main-template.rs b/binaries/cli/src/template/rust/node/main-template.rs index 4ae73e38..659f706c 100644 --- a/binaries/cli/src/template/rust/node/main-template.rs +++ b/binaries/cli/src/template/rust/node/main-template.rs @@ -1,4 +1,4 @@ -use dora_node_api::DoraNode; +use dora_node_api::{DoraNode, Event}; use std::error::Error; fn main() -> Result<(), Box> { From 62319f1ec19ed90d834717526706268104ecbc0b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 14:37:46 +0100 Subject: [PATCH 175/225] Add support for Rust operators again --- Cargo.lock | 7 + apis/c/operator/operator_types.h | 22 ++- apis/rust/operator/macros/src/lib.rs | 18 +- apis/rust/operator/src/lib.rs | 10 +- apis/rust/operator/src/raw.rs | 33 +++- apis/rust/operator/types/src/lib.rs | 21 ++- binaries/daemon/src/lib.rs | 26 ++- binaries/daemon/src/main.rs | 12 +- binaries/daemon/src/spawn.rs | 5 +- binaries/runtime/src/lib.rs | 29 ++- binaries/runtime/src/operator/mod.rs | 32 ++-- binaries/runtime/src/operator/shared_lib.rs | 199 +++++++++++--------- examples/rust-dataflow/dataflow.yml | 13 +- examples/rust-dataflow/operator/Cargo.toml | 13 ++ examples/rust-dataflow/operator/src/lib.rs | 51 +++++ examples/rust-dataflow/run.rs | 16 +- examples/rust-dataflow/sink/src/main.rs | 14 +- 17 files changed, 358 insertions(+), 163 deletions(-) create mode 100644 examples/rust-dataflow/operator/Cargo.toml create mode 100644 examples/rust-dataflow/operator/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index f8a15eb3..a888e51a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3226,6 +3226,13 @@ dependencies = [ "tokio", ] +[[package]] +name = "rust-dataflow-example-operator" +version = "0.1.3" +dependencies = [ + "dora-operator-api", +] 
+ [[package]] name = "rust-dataflow-example-sink" version = "0.1.3" diff --git a/apis/c/operator/operator_types.h b/apis/c/operator/operator_types.h index d8074338..3fcbd911 100644 --- a/apis/c/operator/operator_types.h +++ b/apis/c/operator/operator_types.h @@ -77,13 +77,13 @@ enum DoraStatus { DoraStatus_t; /** */ -typedef struct OnInputResult { +typedef struct OnEventResult { /** */ DoraResult_t result; /** */ DoraStatus_t status; -} OnInputResult_t; +} OnEventResult_t; /** */ typedef struct Metadata { @@ -103,6 +103,18 @@ typedef struct Input { Metadata_t metadata; } Input_t; + +#include + +/** */ +typedef struct FfiEvent { + /** */ + Input_t * input; + + /** */ + bool stop; +} FfiEvent_t; + /** */ typedef struct Output { /** */ @@ -139,10 +151,10 @@ typedef struct SendOutput { } SendOutput_t; /** */ -typedef struct DoraOnInput { +typedef struct DoraOnEvent { /** */ - OnInputResult_t (*on_input)(Input_t const *, SendOutput_t const *, void *); -} DoraOnInput_t; + OnEventResult_t (*on_event)(FfiEvent_t const *, SendOutput_t const *, void *); +} DoraOnEvent_t; #ifdef __cplusplus diff --git a/apis/rust/operator/macros/src/lib.rs b/apis/rust/operator/macros/src/lib.rs index 1b685856..4a6b08d9 100644 --- a/apis/rust/operator/macros/src/lib.rs +++ b/apis/rust/operator/macros/src/lib.rs @@ -49,26 +49,26 @@ fn register_operator_impl(item: &TokenStream2) -> syn::Result { }; }; - let on_input = quote! { + let on_event = quote! 
{ #[no_mangle] - pub unsafe extern "C" fn dora_on_input( - input: &dora_operator_api::types::Input, + pub unsafe extern "C" fn dora_on_event( + event: &dora_operator_api::types::FfiEvent, send_output: &dora_operator_api::types::SendOutput, operator_context: *mut std::ffi::c_void, - ) -> dora_operator_api::types::OnInputResult { - dora_operator_api::raw::dora_on_input::<#operator_ty>( - input, send_output, operator_context + ) -> dora_operator_api::types::OnEventResult { + dora_operator_api::raw::dora_on_event::<#operator_ty>( + event, send_output, operator_context ) } - const _DORA_ON_INPUT: dora_operator_api::types::DoraOnInput = dora_operator_api::types::DoraOnInput { - on_input: dora_operator_api::types::OnInputFn(dora_on_input), + const _DORA_ON_EVENT: dora_operator_api::types::DoraOnEvent = dora_operator_api::types::DoraOnEvent { + on_event: dora_operator_api::types::OnEventFn(dora_on_event), }; }; Ok(quote! { #init #drop - #on_input + #on_event }) } diff --git a/apis/rust/operator/src/lib.rs b/apis/rust/operator/src/lib.rs index dc3dc8de..e850549b 100644 --- a/apis/rust/operator/src/lib.rs +++ b/apis/rust/operator/src/lib.rs @@ -8,12 +8,16 @@ use types::{Metadata, Output, SendOutput}; pub mod raw; +pub enum Event<'a> { + Input { id: &'a str, data: &'a [u8] }, + Stop, +} + pub trait DoraOperator: Default { #[allow(clippy::result_unit_err)] // we use a () error type only for testing - fn on_input( + fn on_event( &mut self, - id: &str, - data: &[u8], + event: &Event, output_sender: &mut DoraOutputSender, ) -> Result; } diff --git a/apis/rust/operator/src/raw.rs b/apis/rust/operator/src/raw.rs index 79c6e8d2..2fcc3418 100644 --- a/apis/rust/operator/src/raw.rs +++ b/apis/rust/operator/src/raw.rs @@ -1,5 +1,5 @@ -use crate::{DoraOperator, DoraOutputSender, DoraStatus}; -use dora_operator_api_types::{DoraInitResult, DoraResult, Input, OnInputResult, SendOutput}; +use crate::{DoraOperator, DoraOutputSender, DoraStatus, Event}; +use 
dora_operator_api_types::{DoraInitResult, DoraResult, FfiEvent, OnEventResult, SendOutput}; use std::ffi::c_void; pub type OutputFnRaw = unsafe extern "C" fn( @@ -26,21 +26,36 @@ pub unsafe fn dora_drop_operator(operator_context: *mut c_void) -> DoraResult DoraResult { error: None } } -pub unsafe fn dora_on_input( - input: &Input, +pub unsafe fn dora_on_event( + event: &FfiEvent, send_output: &SendOutput, operator_context: *mut std::ffi::c_void, -) -> OnInputResult { +) -> OnEventResult { let mut output_sender = DoraOutputSender(send_output); let operator: &mut O = unsafe { &mut *operator_context.cast() }; - let data = input.data.as_ref().as_slice(); - match operator.on_input(&input.id, data, &mut output_sender) { - Ok(status) => OnInputResult { + + let event_variant = if let Some(input) = &event.input { + let data = input.data.as_ref().as_slice(); + Event::Input { + id: &input.id, + data, + } + } else if event.stop { + Event::Stop + } else { + // ignore unknown events + return OnEventResult { + result: DoraResult { error: None }, + status: DoraStatus::Continue, + }; + }; + match operator.on_event(&event_variant, &mut output_sender) { + Ok(status) => OnEventResult { result: DoraResult { error: None }, status, }, - Err(error) => OnInputResult { + Err(error) => OnEventResult { result: DoraResult { error: Some(error.into()), }, diff --git a/apis/rust/operator/types/src/lib.rs b/apis/rust/operator/types/src/lib.rs index 312caf85..c10efbac 100644 --- a/apis/rust/operator/types/src/lib.rs +++ b/apis/rust/operator/types/src/lib.rs @@ -37,21 +37,30 @@ pub struct DoraResult { #[derive_ReprC] #[ffi_export] #[repr(C)] -pub struct DoraOnInput { - pub on_input: OnInputFn, +pub struct DoraOnEvent { + pub on_event: OnEventFn, } #[derive_ReprC] #[ffi_export] #[repr(transparent)] -pub struct OnInputFn( +pub struct OnEventFn( pub unsafe extern "C" fn( - input: &Input, + event: &FfiEvent, send_output: &SendOutput, operator_context: *mut std::ffi::c_void, - ) -> OnInputResult, + ) -> 
OnEventResult, ); +#[derive_ReprC] +#[ffi_export] +#[repr(C)] +#[derive(Debug)] +pub struct FfiEvent { + pub input: Option>, + pub stop: bool, +} + #[derive_ReprC] #[ffi_export] #[repr(C)] @@ -91,7 +100,7 @@ pub struct Output { #[ffi_export] #[repr(C)] #[derive(Debug)] -pub struct OnInputResult { +pub struct OnEventResult { pub result: DoraResult, pub status: DoraStatus, } diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index cc369814..a06b9cca 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -49,19 +49,35 @@ pub struct Daemon { /// used for testing and examples exit_when_done: Option>, + + dora_runtime_path: Option, } impl Daemon { - pub async fn run(coordinator_addr: SocketAddr, machine_id: String) -> eyre::Result<()> { + pub async fn run( + coordinator_addr: SocketAddr, + machine_id: String, + dora_runtime_path: Option, + ) -> eyre::Result<()> { // connect to the coordinator let coordinator_events = coordinator::register(coordinator_addr, machine_id.clone()) .await .wrap_err("failed to connect to dora-coordinator")? .map(Event::Coordinator); - Self::run_general(coordinator_events, Some(coordinator_addr), machine_id, None).await + Self::run_general( + coordinator_events, + Some(coordinator_addr), + machine_id, + None, + dora_runtime_path, + ) + .await } - pub async fn run_dataflow(dataflow_path: &Path) -> eyre::Result<()> { + pub async fn run_dataflow( + dataflow_path: &Path, + dora_runtime_path: Option, + ) -> eyre::Result<()> { let working_dir = dataflow_path .canonicalize() .context("failed to canoncialize dataflow path")? 
@@ -96,6 +112,7 @@ impl Daemon { None, "".into(), Some(exit_when_done), + dora_runtime_path, ); let spawn_result = reply_rx @@ -116,6 +133,7 @@ impl Daemon { coordinator_addr: Option, machine_id: String, exit_when_done: Option>, + dora_runtime_path: Option, ) -> eyre::Result<()> { let (dora_events_tx, dora_events_rx) = mpsc::channel(5); let ctrlc_tx = dora_events_tx.clone(); @@ -144,6 +162,7 @@ impl Daemon { coordinator_addr, machine_id, exit_when_done, + dora_runtime_path, }; let (shmem_events_tx, shmem_events_rx) = flume::bounded(5); tokio::spawn(async { @@ -333,6 +352,7 @@ impl Daemon { self.events_tx.clone(), self.shared_memory_handler_node.clone(), daemon_communication_config, + self.dora_runtime_path.as_deref(), ) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))?; diff --git a/binaries/daemon/src/main.rs b/binaries/daemon/src/main.rs index dbb8f24d..d7b9e448 100644 --- a/binaries/daemon/src/main.rs +++ b/binaries/daemon/src/main.rs @@ -10,6 +10,9 @@ use tracing_subscriber::Layer; pub struct Args { #[clap(long)] pub run_dataflow: Option, + + #[clap(long)] + pub dora_runtime_path: Option, } #[tokio::main] @@ -22,13 +25,16 @@ async fn main() -> eyre::Result<()> { async fn run() -> eyre::Result<()> { set_up_tracing().wrap_err("failed to set up tracing subscriber")?; - let Args { run_dataflow } = clap::Parser::parse(); + let Args { + run_dataflow, + dora_runtime_path, + } = clap::Parser::parse(); match run_dataflow { Some(dataflow_path) => { tracing::info!("Starting dataflow `{}`", dataflow_path.display()); - Daemon::run_dataflow(&dataflow_path).await + Daemon::run_dataflow(&dataflow_path, dora_runtime_path).await } None => { tracing::info!("Starting in local mode"); @@ -37,7 +43,7 @@ async fn run() -> eyre::Result<()> { let machine_id = String::new(); // TODO - Daemon::run(coordinator_socket.into(), machine_id).await + Daemon::run(coordinator_socket.into(), machine_id, dora_runtime_path).await } } } diff --git a/binaries/daemon/src/spawn.rs 
b/binaries/daemon/src/spawn.rs index a5f83789..224b0ecf 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -19,6 +19,7 @@ pub async fn spawn_node( daemon_tx: mpsc::Sender, shmem_handler_tx: flume::Sender, config: DaemonCommunicationConfig, + dora_runtime_path: Option<&Path>, ) -> eyre::Result<()> { let node_id = node.id.clone(); tracing::debug!("Spawning node `{dataflow_id}/{node_id}`"); @@ -97,7 +98,9 @@ pub async fn spawn_node( command.args(["-c", "import dora; dora.start_runtime()"]); command } else if !has_python_operator && has_other_operator { - tokio::process::Command::new("dora-runtime") + tokio::process::Command::new( + dora_runtime_path.unwrap_or_else(|| Path::new("dora-runtime")), + ) } else { eyre::bail!("Runtime can not mix Python Operator with other type of operator."); }; diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index a226bda5..cbee3a23 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -95,7 +95,7 @@ pub fn main() -> eyre::Result<()> { .wrap_err_with(|| format!("failed to run operator {operator_id}"))?; match main_task.join() { - Ok(result) => result.wrap_err("Stop loop thread failed unexpectedly.")?, + Ok(result) => result.wrap_err("main task failed")?, Err(panic) => std::panic::resume_unwind(panic), } @@ -126,8 +126,6 @@ async fn run( .map(|(id, config)| (id, config.inputs.keys().collect())) .collect(); - // let mut stopped_operators = BTreeSet::new(); - while let Some(event) = events.next().await { match event { Event::Operator { @@ -168,8 +166,15 @@ async fn run( let result = node.close_outputs(outputs); (node, result) }) - .await?; + .await + .wrap_err("failed to wait for close_outputs task")?; result.wrap_err("failed to close outputs of finished operator")?; + + operator_channels.remove(&operator_id); + + if operator_channels.is_empty() { + break; + } } OperatorEvent::Output { output_id, @@ -184,7 +189,8 @@ async fn run( }); (node, result) }) - .await?; + 
.await + .wrap_err("failed to wait for send_output task")?; result.wrap_err("failed to send node output")?; } } @@ -207,13 +213,19 @@ async fn run( continue; }; - operator_channel + if let Err(err) = operator_channel .send(operator::IncomingEvent::Input { - input_id, + input_id: input_id.clone(), metadata, data, }) - .await?; + .await + .wrap_err_with(|| { + format!("failed to send input `{input_id}` to operator `{operator_id}`") + }) + { + tracing::warn!("{err}"); + } } Event::InputClosed(id) => { let Some((operator_id, input_id)) = id.as_str().split_once('/') else { @@ -246,6 +258,7 @@ fn operator_output_id(operator_id: &OperatorId, output_id: &DataId) -> DataId { DataId::from(format!("{operator_id}/{output_id}")) } +#[derive(Debug)] enum Event { Operator { id: OperatorId, diff --git a/binaries/runtime/src/operator/mod.rs b/binaries/runtime/src/operator/mod.rs index ded825df..1810b5b7 100644 --- a/binaries/runtime/src/operator/mod.rs +++ b/binaries/runtime/src/operator/mod.rs @@ -18,7 +18,7 @@ use tokio::sync::mpsc::{Receiver, Sender}; type Tracer = (); mod python; -// mod shared_lib; +mod shared_lib; pub fn run_operator( node_id: &NodeId, @@ -36,22 +36,20 @@ pub fn run_operator( match &operator_definition.config.source { OperatorSource::SharedLibrary(source) => { - // shared_lib::spawn( - // node_id, - // &operator_definition.id, - // source, - // events_tx, - // input_events, - // publishers, - // tracer, - // ) - // .wrap_err_with(|| { - // format!( - // "failed to spawn shared library operator for {}", - // operator_definition.id - // ) - // })?; - todo!() + shared_lib::run( + node_id, + &operator_definition.id, + source, + events_tx, + incoming_events, + tracer, + ) + .wrap_err_with(|| { + format!( + "failed to spawn shared library operator for {}", + operator_definition.id + ) + })?; } OperatorSource::Python(source) => { python::run( diff --git a/binaries/runtime/src/operator/shared_lib.rs b/binaries/runtime/src/operator/shared_lib.rs index 
e8d40709..0410a8a9 100644 --- a/binaries/runtime/src/operator/shared_lib.rs +++ b/binaries/runtime/src/operator/shared_lib.rs @@ -1,37 +1,33 @@ -use super::{OperatorEvent, StopReason, Tracer}; +use super::{IncomingEvent, OperatorEvent, StopReason, Tracer}; use dora_core::{ adjust_shared_library_path, config::{DataId, NodeId, OperatorId}, descriptor::source_is_url, }; use dora_download::download_file; -use dora_message::uhlc; -use dora_node_api::communication::Publisher; +use dora_node_api::{uhlc, MetadataParameters}; use dora_operator_api_types::{ - safer_ffi::closure::ArcDynFn1, DoraDropOperator, DoraInitOperator, DoraInitResult, DoraOnInput, - DoraResult, DoraStatus, Metadata, OnInputResult, Output, SendOutput, + safer_ffi::closure::ArcDynFn1, DoraDropOperator, DoraInitOperator, DoraInitResult, DoraOnEvent, + DoraResult, DoraStatus, Metadata, OnEventResult, Output, SendOutput, }; use eyre::{bail, eyre, Context}; -use flume::Receiver; use libloading::Symbol; use std::{ - collections::HashMap, + borrow::Cow, ffi::c_void, ops::Deref, panic::{catch_unwind, AssertUnwindSafe}, path::Path, sync::Arc, - thread, }; -use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::{Receiver, Sender}; -pub fn spawn( +pub fn run( node_id: &NodeId, operator_id: &OperatorId, source: &str, events_tx: Sender, - inputs: Receiver, - publishers: HashMap>, + mut incoming_events: Receiver, tracer: Tracer, ) -> eyre::Result<()> { let path = if source_is_url(source) { @@ -57,47 +53,43 @@ pub fn spawn( }; let hlc = uhlc::HLC::default(); - thread::spawn(move || { - let closure = AssertUnwindSafe(|| { - let bindings = Bindings::init(&library).context("failed to init operator")?; + let closure = AssertUnwindSafe(|| { + let bindings = Bindings::init(&library).context("failed to init operator")?; - let operator = SharedLibraryOperator { - inputs, - bindings, - hlc, - }; + let operator = SharedLibraryOperator { + incoming_events, + bindings, + hlc, + events_tx: events_tx.clone(), + }; - 
operator.run(publishers, tracer) - }); - match catch_unwind(closure) { - Ok(Ok(reason)) => { - let _ = events_tx.blocking_send(OperatorEvent::Finished { reason }); - } - Ok(Err(err)) => { - let _ = events_tx.blocking_send(OperatorEvent::Error(err)); - } - Err(panic) => { - let _ = events_tx.blocking_send(OperatorEvent::Panic(panic)); - } - } + operator.run(tracer) }); + match catch_unwind(closure) { + Ok(Ok(reason)) => { + let _ = events_tx.blocking_send(OperatorEvent::Finished { reason }); + } + Ok(Err(err)) => { + let _ = events_tx.blocking_send(OperatorEvent::Error(err)); + } + Err(panic) => { + let _ = events_tx.blocking_send(OperatorEvent::Panic(panic)); + } + } Ok(()) } struct SharedLibraryOperator<'lib> { - inputs: Receiver, + incoming_events: Receiver, + events_tx: Sender, bindings: Bindings<'lib>, hlc: uhlc::HLC, } impl<'lib> SharedLibraryOperator<'lib> { - fn run( - self, - publishers: HashMap>, - tracer: Tracer, - ) -> eyre::Result { + fn run(mut self, tracer: Tracer) -> eyre::Result { let operator_context = { let DoraInitResult { result, @@ -115,32 +107,27 @@ impl<'lib> SharedLibraryOperator<'lib> { let send_output_closure = Arc::new(move |output: Output| { let Output { - id, + id: output_id, data, metadata: Metadata { open_telemetry_context, }, } = output; - let mut metadata = dora_node_api::Metadata::new(self.hlc.new_timestamp()); - metadata.parameters.open_telemetry_context = - String::from(open_telemetry_context).into(); - - let message = metadata - .serialize() - .context(format!("failed to serialize `{}` metadata", id.deref())) - .map_err(|err| err.into()); - - let result = message.and_then(|mut message| match publishers.get(id.deref()) { - Some(publisher) => { - message.extend_from_slice(&data); // TODO avoid copy - publisher.publish(&message) - } - None => Err(eyre!( - "unexpected output {} (not defined in dataflow config)", - id.deref() - ) - .into()), - }); + let metadata = MetadataParameters { + open_telemetry_context: 
Cow::Owned(open_telemetry_context.into()), + ..Default::default() + }; + + let event = OperatorEvent::Output { + output_id: DataId::from(String::from(output_id)), + metadata, + data: data.to_owned(), + }; + + let result = self + .events_tx + .blocking_send(event) + .map_err(|_| eyre!("failed to send output to runtime")); let error = match result { Ok(()) => None, @@ -151,47 +138,75 @@ impl<'lib> SharedLibraryOperator<'lib> { }); let reason = loop { - let Ok(input) = self.inputs.recv() else { + let Some(mut event) = self.incoming_events.blocking_recv() else { break StopReason::InputsClosed }; - #[cfg(feature = "tracing")] - let (_child_cx, string_cx) = { - use dora_tracing::{deserialize_context, serialize_context}; - use opentelemetry::{ - trace::{TraceContextExt, Tracer}, - Context as OtelContext, + + if let IncomingEvent::Input { + input_id, metadata, .. + } = &mut event + { + #[cfg(feature = "tracing")] + let (_child_cx, string_cx) = { + use dora_tracing::{deserialize_context, serialize_context}; + use opentelemetry::{ + trace::{TraceContextExt, Tracer}, + Context as OtelContext, + }; + + let span = tracer.start_with_context( + format!("{}", input.id), + &deserialize_context(&input.metadata.parameters.open_telemetry_context), + ); + let child_cx = OtelContext::current_with_span(span); + let string_cx = serialize_context(&child_cx); + (child_cx, string_cx) }; + #[cfg(not(feature = "tracing"))] + let string_cx = { + let () = tracer; + "".to_string() + }; + metadata.parameters.open_telemetry_context = Cow::Owned(string_cx); + } - let span = tracer.start_with_context( - format!("{}", input.id), - &deserialize_context(&input.metadata.parameters.open_telemetry_context), - ); - let child_cx = OtelContext::current_with_span(span); - let string_cx = serialize_context(&child_cx); - (child_cx, string_cx) - }; - #[cfg(not(feature = "tracing"))] - let string_cx = { - let () = tracer; - "".to_string() - }; - let operator_input = dora_operator_api_types::Input { - data: 
input.data().into_owned().into(), - id: String::from(input.id).into(), - metadata: Metadata { - open_telemetry_context: string_cx.into(), + let operator_event = match event { + IncomingEvent::Stop => dora_operator_api_types::FfiEvent { + input: None, + stop: true, }, + IncomingEvent::Input { + input_id, + metadata, + data, + } => { + let operator_input = dora_operator_api_types::Input { + id: String::from(input_id).into(), + data: data.unwrap_or_default().into(), + metadata: Metadata { + open_telemetry_context: metadata + .parameters + .open_telemetry_context + .into_owned() + .into(), + }, + }; + dora_operator_api_types::FfiEvent { + input: Some(Box::new(operator_input).into()), + stop: false, + } + } }; let send_output = SendOutput { send_output: ArcDynFn1::new(send_output_closure.clone()), }; - let OnInputResult { + let OnEventResult { result: DoraResult { error }, status, } = unsafe { - (self.bindings.on_input.on_input)( - &operator_input, + (self.bindings.on_event.on_event)( + &operator_event, &send_output, operator_context.raw, ) @@ -223,7 +238,7 @@ impl<'lib> Drop for OperatorContext<'lib> { struct Bindings<'lib> { init_operator: Symbol<'lib, DoraInitOperator>, drop_operator: Symbol<'lib, DoraDropOperator>, - on_input: Symbol<'lib, DoraOnInput>, + on_event: Symbol<'lib, DoraOnEvent>, } impl<'lib> Bindings<'lib> { @@ -236,9 +251,9 @@ impl<'lib> Bindings<'lib> { drop_operator: library .get(b"dora_drop_operator") .wrap_err("failed to get `dora_drop_operator`")?, - on_input: library - .get(b"dora_on_input") - .wrap_err("failed to get `dora_on_input`")?, + on_event: library + .get(b"dora_on_event") + .wrap_err("failed to get `dora_on_event`")?, } }; Ok(bindings) diff --git a/examples/rust-dataflow/dataflow.yml b/examples/rust-dataflow/dataflow.yml index d3777d97..838ae154 100644 --- a/examples/rust-dataflow/dataflow.yml +++ b/examples/rust-dataflow/dataflow.yml @@ -13,10 +13,19 @@ nodes: tick: dora/timer/millis/10 outputs: - random + - id: runtime-node + 
operators: + - id: rust-operator + build: cargo build -p rust-dataflow-example-operator + shared-library: ../../target/debug/rust_dataflow_example_operator + inputs: + tick: dora/timer/millis/100 + random: rust-node/random + outputs: + - status - id: rust-sink custom: build: cargo build -p rust-dataflow-example-sink source: ../../target/debug/rust-dataflow-example-sink inputs: - # message: runtime-node/rust-operator/status - message: rust-node/random + message: runtime-node/rust-operator/status diff --git a/examples/rust-dataflow/operator/Cargo.toml b/examples/rust-dataflow/operator/Cargo.toml new file mode 100644 index 00000000..2422f1fa --- /dev/null +++ b/examples/rust-dataflow/operator/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "rust-dataflow-example-operator" +version.workspace = true +edition = "2021" +license = "Apache-2.0" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +crate-type = ["cdylib"] + +[dependencies] +dora-operator-api = { path = "../../../apis/rust/operator" } diff --git a/examples/rust-dataflow/operator/src/lib.rs b/examples/rust-dataflow/operator/src/lib.rs new file mode 100644 index 00000000..681213c5 --- /dev/null +++ b/examples/rust-dataflow/operator/src/lib.rs @@ -0,0 +1,51 @@ +#![warn(unsafe_op_in_unsafe_fn)] + +use dora_operator_api::{register_operator, DoraOperator, DoraOutputSender, DoraStatus, Event}; +use std::time::{Duration, Instant}; + +register_operator!(ExampleOperator); + +#[derive(Debug, Default)] +struct ExampleOperator { + ticks: usize, + last_random_at: Option, +} + +impl DoraOperator for ExampleOperator { + fn on_event( + &mut self, + event: &Event, + output_sender: &mut DoraOutputSender, + ) -> Result { + match event { + Event::Input { id, data } => match *id { + "tick" => { + self.ticks += 1; + } + "random" => { + let parsed = { + let data: [u8; 8] = + (*data).try_into().map_err(|_| "unexpected random data")?; + u64::from_le_bytes(data) + }; + let output = 
format!( + "operator received random value {parsed:#x} after {} ticks", + self.ticks + ); + output_sender.send("status".into(), output.into_bytes())?; + self.last_random_at = Some(Instant::now()); + } + other => eprintln!("ignoring unexpected input {other}"), + }, + Event::Stop => {} + } + + if let Some(last_random_at) = self.last_random_at { + if last_random_at.elapsed() > Duration::from_secs(1) { + // looks like the node sending the random values finished -> exit too + return Ok(DoraStatus::Stop); + } + } + Ok(DoraStatus::Continue) + } +} diff --git a/examples/rust-dataflow/run.rs b/examples/rust-dataflow/run.rs index 37f26473..499ac685 100644 --- a/examples/rust-dataflow/run.rs +++ b/examples/rust-dataflow/run.rs @@ -14,7 +14,10 @@ async fn main() -> eyre::Result<()> { let dataflow = Path::new("dataflow.yml"); build_dataflow(dataflow).await?; - dora_daemon::Daemon::run_dataflow(dataflow).await?; + build_package("dora-runtime").await?; + let dora_runtime_path = Some(root.join("target").join("debug").join("dora-runtime")); + + dora_daemon::Daemon::run_dataflow(dataflow, dora_runtime_path).await?; Ok(()) } @@ -31,6 +34,17 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { Ok(()) } +async fn build_package(package: &str) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + cmd.arg("build"); + cmd.arg("--package").arg(package); + if !cmd.status().await?.success() { + bail!("failed to build {package}"); + }; + Ok(()) +} + fn set_up_tracing() -> eyre::Result<()> { use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; diff --git a/examples/rust-dataflow/sink/src/main.rs b/examples/rust-dataflow/sink/src/main.rs index ab49698d..18632c3f 100644 --- a/examples/rust-dataflow/sink/src/main.rs +++ b/examples/rust-dataflow/sink/src/main.rs @@ -1,5 +1,5 @@ use dora_node_api::{self, DoraNode, Event}; -use eyre::ContextCompat; +use eyre::{bail, Context, ContextCompat}; fn main() 
-> eyre::Result<()> { let (_node, mut events) = DoraNode::init_from_env()?; @@ -13,9 +13,15 @@ fn main() -> eyre::Result<()> { } => match id.as_str() { "message" => { let data = data.wrap_err("no data")?; - let raw = (&data[..]).try_into().unwrap(); - - println!("received data: {:#x}", u64::from_le_bytes(raw)); + let received_string = std::str::from_utf8(&data) + .wrap_err("received message was not utf8-encoded")?; + println!("sink received message: {}", received_string); + if !received_string.starts_with("operator received random value ") { + bail!("unexpected message format (should start with 'operator received random value')") + } + if !received_string.ends_with(" ticks") { + bail!("unexpected message format (should end with 'ticks')") + } } other => eprintln!("Ignoring unexpected input `{other}`"), }, From 7fdbca12c6cbdb576ec72acbc3424c74a988738a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 16:14:02 +0100 Subject: [PATCH 176/225] Fix check errors --- apis/c++/operator/src/lib.rs | 39 +++++++++++++++++++------------ examples/benchmark/run.rs | 2 +- examples/c++-dataflow/run.rs | 2 +- examples/c-dataflow/run.rs | 2 +- examples/rust-dataflow-url/run.rs | 2 +- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/apis/c++/operator/src/lib.rs b/apis/c++/operator/src/lib.rs index ebf20c49..511b4470 100644 --- a/apis/c++/operator/src/lib.rs +++ b/apis/c++/operator/src/lib.rs @@ -1,7 +1,9 @@ #![cfg(not(test))] #![warn(unsafe_op_in_unsafe_fn)] -use dora_operator_api::{self, register_operator, DoraOperator, DoraOutputSender, DoraStatus}; +use dora_operator_api::{ + self, register_operator, DoraOperator, DoraOutputSender, DoraStatus, Event, +}; use ffi::DoraSendOutputResult; #[cxx::bridge] @@ -64,23 +66,30 @@ impl Default for OperatorWrapper { } impl DoraOperator for OperatorWrapper { - fn on_input( + fn on_event( &mut self, - id: &str, - data: &[u8], + event: &Event, output_sender: &mut DoraOutputSender, ) -> Result { - let operator = 
self.operator.as_mut().unwrap(); - let mut output_sender = OutputSender(output_sender); - - let result = ffi::on_input(operator, id, data, &mut output_sender); - if result.error.is_empty() { - Ok(match result.stop { - false => DoraStatus::Continue, - true => DoraStatus::Stop, - }) - } else { - Err(result.error) + match event { + Event::Input { id, data } => { + let operator = self.operator.as_mut().unwrap(); + let mut output_sender = OutputSender(output_sender); + + let result = ffi::on_input(operator, id, data, &mut output_sender); + if result.error.is_empty() { + Ok(match result.stop { + false => DoraStatus::Continue, + true => DoraStatus::Stop, + }) + } else { + Err(result.error) + } + } + _ => { + // ignore other events for now + Ok(DoraStatus::Continue) + } } } } diff --git a/examples/benchmark/run.rs b/examples/benchmark/run.rs index 37f26473..798b420c 100644 --- a/examples/benchmark/run.rs +++ b/examples/benchmark/run.rs @@ -14,7 +14,7 @@ async fn main() -> eyre::Result<()> { let dataflow = Path::new("dataflow.yml"); build_dataflow(dataflow).await?; - dora_daemon::Daemon::run_dataflow(dataflow).await?; + dora_daemon::Daemon::run_dataflow(dataflow, None).await?; Ok(()) } diff --git a/examples/c++-dataflow/run.rs b/examples/c++-dataflow/run.rs index a6c3c285..3e75823d 100644 --- a/examples/c++-dataflow/run.rs +++ b/examples/c++-dataflow/run.rs @@ -88,7 +88,7 @@ async fn main() -> eyre::Result<()> { // build_package("dora-runtime").await?; let dataflow = Path::new("dataflow.yml").to_owned(); - dora_daemon::Daemon::run_dataflow(&dataflow).await?; + dora_daemon::Daemon::run_dataflow(&dataflow, None).await?; Ok(()) } diff --git a/examples/c-dataflow/run.rs b/examples/c-dataflow/run.rs index b041b773..39290ff0 100644 --- a/examples/c-dataflow/run.rs +++ b/examples/c-dataflow/run.rs @@ -18,7 +18,7 @@ async fn main() -> eyre::Result<()> { build_c_node(root, "sink.c", "c_sink").await?; let dataflow = Path::new("dataflow.yml").to_owned(); - 
dora_daemon::Daemon::run_dataflow(&dataflow).await?; + dora_daemon::Daemon::run_dataflow(&dataflow, None).await?; Ok(()) } diff --git a/examples/rust-dataflow-url/run.rs b/examples/rust-dataflow-url/run.rs index 7d0698a3..3c384b6d 100644 --- a/examples/rust-dataflow-url/run.rs +++ b/examples/rust-dataflow-url/run.rs @@ -10,7 +10,7 @@ async fn main() -> eyre::Result<()> { let dataflow = Path::new("dataflow.yml"); build_dataflow(dataflow).await?; - dora_daemon::Daemon::run_dataflow(dataflow).await?; + dora_daemon::Daemon::run_dataflow(dataflow, None).await?; Ok(()) } From e77804b29372754584e3ed79081c8add09013cce Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 16:48:55 +0100 Subject: [PATCH 177/225] Check for node exit errors when running single dataflow --- binaries/daemon/src/lib.rs | 54 +++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index a06b9cca..6bf39107 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -49,6 +49,8 @@ pub struct Daemon { /// used for testing and examples exit_when_done: Option>, + /// used to record dataflow results when `exit_when_done` is used + dataflow_errors: Vec<(Uuid, NodeId, eyre::Report)>, dora_runtime_path: Option, } @@ -72,6 +74,7 @@ impl Daemon { dora_runtime_path, ) .await + .map(|_| ()) } pub async fn run_dataflow( @@ -124,8 +127,18 @@ impl Daemon { } }); - future::try_join(run_result, spawn_result).await?; - Ok(()) + let (dataflow_errors, _) = future::try_join(run_result, spawn_result).await?; + + if dataflow_errors.is_empty() { + Ok(()) + } else { + let mut output = "some nodes failed:".to_owned(); + for (dataflow, node, error) in dataflow_errors { + use std::fmt::Write; + write!(&mut output, "\n - {dataflow}/{node}: {error}").unwrap(); + } + bail!("{output}"); + } } async fn run_general( @@ -134,7 +147,7 @@ impl Daemon { machine_id: String, exit_when_done: Option>, 
dora_runtime_path: Option, - ) -> eyre::Result<()> { + ) -> eyre::Result> { let (dora_events_tx, dora_events_rx) = mpsc::channel(5); let ctrlc_tx = dora_events_tx.clone(); let mut ctrlc_sent = false; @@ -163,6 +176,7 @@ impl Daemon { machine_id, exit_when_done, dora_runtime_path, + dataflow_errors: Vec::new(), }; let (shmem_events_tx, shmem_events_rx) = flume::bounded(5); tokio::spawn(async { @@ -190,7 +204,7 @@ impl Daemon { async fn run_inner( mut self, incoming_events: impl Stream + Unpin, - ) -> eyre::Result<()> { + ) -> eyre::Result> { let mut events = incoming_events; while let Some(event) = events.next().await { @@ -251,7 +265,7 @@ impl Daemon { // } } - Ok(()) + Ok(self.dataflow_errors) } async fn handle_coordinator_event( @@ -519,20 +533,23 @@ impl Daemon { exit_status, } => { let mut signal_exit = false; - match exit_status { + let node_error = match exit_status { NodeExitStatus::Success => { tracing::info!("node {dataflow_id}/{node_id} finished successfully"); + None } NodeExitStatus::IoError(err) => { let err = eyre!(err).wrap_err(format!( "I/O error while waiting for node `{dataflow_id}/{node_id}`" )); - tracing::error!("{err:?}",); + tracing::error!("{err:?}"); + Some(err) } NodeExitStatus::ExitCode(code) => { - tracing::warn!( - "node {dataflow_id}/{node_id} finished with exit code {code}" - ); + let err = + eyre!("node {dataflow_id}/{node_id} finished with exit code {code}"); + tracing::warn!("{err}"); + Some(err) } NodeExitStatus::Signal(signal) => { signal_exit = true; @@ -553,16 +570,19 @@ impl Daemon { other => other.to_string().into(), }; - tracing::warn!( + let err = eyre!( "node {dataflow_id}/{node_id} finished because of signal `{signal}`" ); + tracing::warn!("{err}"); + Some(err) } NodeExitStatus::Unknown => { - tracing::warn!( - "node {dataflow_id}/{node_id} finished with unknown exit code" - ); + let err = + eyre!("node {dataflow_id}/{node_id} finished with unknown exit code"); + tracing::warn!("{err}"); + Some(err) } - } + }; if self 
.running @@ -579,6 +599,10 @@ impl Daemon { } if let Some(exit_when_done) = &mut self.exit_when_done { + if let Some(err) = node_error { + self.dataflow_errors + .push((dataflow_id, node_id.clone(), err)); + } exit_when_done.remove(&(dataflow_id, node_id)); if exit_when_done.is_empty() { tracing::info!( From cdc0be1206cede524659eddf0ab358722e88c6ae Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 17:13:45 +0100 Subject: [PATCH 178/225] Rename `FfiEvent` to `RawEvent` --- apis/c/operator/operator_types.h | 6 +++--- apis/rust/operator/macros/src/lib.rs | 2 +- apis/rust/operator/src/raw.rs | 4 ++-- apis/rust/operator/types/src/lib.rs | 4 ++-- binaries/runtime/src/operator/shared_lib.rs | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/apis/c/operator/operator_types.h b/apis/c/operator/operator_types.h index 3fcbd911..ca95fc2e 100644 --- a/apis/c/operator/operator_types.h +++ b/apis/c/operator/operator_types.h @@ -107,13 +107,13 @@ typedef struct Input { #include /** */ -typedef struct FfiEvent { +typedef struct RawEvent { /** */ Input_t * input; /** */ bool stop; -} FfiEvent_t; +} RawEvent_t; /** */ typedef struct Output { @@ -153,7 +153,7 @@ typedef struct SendOutput { /** */ typedef struct DoraOnEvent { /** */ - OnEventResult_t (*on_event)(FfiEvent_t const *, SendOutput_t const *, void *); + OnEventResult_t (*on_event)(RawEvent_t const *, SendOutput_t const *, void *); } DoraOnEvent_t; diff --git a/apis/rust/operator/macros/src/lib.rs b/apis/rust/operator/macros/src/lib.rs index 4a6b08d9..2af1687d 100644 --- a/apis/rust/operator/macros/src/lib.rs +++ b/apis/rust/operator/macros/src/lib.rs @@ -52,7 +52,7 @@ fn register_operator_impl(item: &TokenStream2) -> syn::Result { let on_event = quote! 
{ #[no_mangle] pub unsafe extern "C" fn dora_on_event( - event: &dora_operator_api::types::FfiEvent, + event: &dora_operator_api::types::RawEvent, send_output: &dora_operator_api::types::SendOutput, operator_context: *mut std::ffi::c_void, ) -> dora_operator_api::types::OnEventResult { diff --git a/apis/rust/operator/src/raw.rs b/apis/rust/operator/src/raw.rs index 2fcc3418..dee87bb6 100644 --- a/apis/rust/operator/src/raw.rs +++ b/apis/rust/operator/src/raw.rs @@ -1,5 +1,5 @@ use crate::{DoraOperator, DoraOutputSender, DoraStatus, Event}; -use dora_operator_api_types::{DoraInitResult, DoraResult, FfiEvent, OnEventResult, SendOutput}; +use dora_operator_api_types::{DoraInitResult, DoraResult, OnEventResult, RawEvent, SendOutput}; use std::ffi::c_void; pub type OutputFnRaw = unsafe extern "C" fn( @@ -27,7 +27,7 @@ pub unsafe fn dora_drop_operator(operator_context: *mut c_void) -> DoraResult } pub unsafe fn dora_on_event( - event: &FfiEvent, + event: &RawEvent, send_output: &SendOutput, operator_context: *mut std::ffi::c_void, ) -> OnEventResult { diff --git a/apis/rust/operator/types/src/lib.rs b/apis/rust/operator/types/src/lib.rs index c10efbac..dc8117a2 100644 --- a/apis/rust/operator/types/src/lib.rs +++ b/apis/rust/operator/types/src/lib.rs @@ -46,7 +46,7 @@ pub struct DoraOnEvent { #[repr(transparent)] pub struct OnEventFn( pub unsafe extern "C" fn( - event: &FfiEvent, + event: &RawEvent, send_output: &SendOutput, operator_context: *mut std::ffi::c_void, ) -> OnEventResult, @@ -56,7 +56,7 @@ pub struct OnEventFn( #[ffi_export] #[repr(C)] #[derive(Debug)] -pub struct FfiEvent { +pub struct RawEvent { pub input: Option>, pub stop: bool, } diff --git a/binaries/runtime/src/operator/shared_lib.rs b/binaries/runtime/src/operator/shared_lib.rs index 0410a8a9..0b6c192f 100644 --- a/binaries/runtime/src/operator/shared_lib.rs +++ b/binaries/runtime/src/operator/shared_lib.rs @@ -171,7 +171,7 @@ impl<'lib> SharedLibraryOperator<'lib> { } let operator_event = match 
event { - IncomingEvent::Stop => dora_operator_api_types::FfiEvent { + IncomingEvent::Stop => dora_operator_api_types::RawEvent { input: None, stop: true, }, @@ -191,7 +191,7 @@ impl<'lib> SharedLibraryOperator<'lib> { .into(), }, }; - dora_operator_api_types::FfiEvent { + dora_operator_api_types::RawEvent { input: Some(Box::new(operator_input).into()), stop: false, } From 52f8957d43cbed0b91dc031a8e1c563cdcc771e2 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 17:15:49 +0100 Subject: [PATCH 179/225] Update operator C header file for new event-based interface --- apis/c/operator/operator_api.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apis/c/operator/operator_api.h b/apis/c/operator/operator_api.h index 4d7414cf..846fd3cb 100644 --- a/apis/c/operator/operator_api.h +++ b/apis/c/operator/operator_api.h @@ -18,8 +18,8 @@ extern "C" EXPORT DoraResult_t dora_drop_operator(void *operator_context); - EXPORT OnInputResult_t dora_on_input( - const Input_t *input, + EXPORT OnEventResult_t dora_on_event( + const RawEvent_t *event, const SendOutput_t *send_output, void *operator_context); @@ -27,7 +27,7 @@ extern "C" { DoraInitOperator_t __dora_init_operator = {.init_operator = dora_init_operator}; DoraDropOperator_t __dora_drop_operator = {.drop_operator = dora_drop_operator}; - DoraOnInput_t __dora_on_input = {.on_input = dora_on_input}; + DoraOnEvent_t __dora_on_event = {.on_event = dora_on_event}; } #ifdef __cplusplus } /* extern \"C\" */ From 11bb9e446a35759bdb38b062bd21b47f22c3e80a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 17:18:37 +0100 Subject: [PATCH 180/225] Update Rust operator template --- .../cli/src/template/rust/operator/lib-template.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/binaries/cli/src/template/rust/operator/lib-template.rs b/binaries/cli/src/template/rust/operator/lib-template.rs index 549dc623..3c6d7470 100644 --- 
a/binaries/cli/src/template/rust/operator/lib-template.rs +++ b/binaries/cli/src/template/rust/operator/lib-template.rs @@ -8,15 +8,18 @@ struct ExampleOperator { } impl DoraOperator for ExampleOperator { - fn on_input( + fn on_event( &mut self, - id: &str, - data: &[u8], + event: &Event, output_sender: &mut DoraOutputSender, ) -> Result { - match id { - other => eprintln!("Received input {other}"), + match event { + Event::Input { id, data } => match id { + other => eprintln!("Received input {other}"), + }, + _ => {} } + Ok(DoraStatus::Continue) } } From 4a28a0aa829bbc128581ce1ba5a3e668770d008d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 23 Feb 2023 17:22:03 +0100 Subject: [PATCH 181/225] Re-add a Rust-API based operator in C++ example dataflow --- examples/c++-dataflow/dataflow.yml | 18 +++++------ examples/c++-dataflow/run.rs | 48 ++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/examples/c++-dataflow/dataflow.yml b/examples/c++-dataflow/dataflow.yml index 02f1d4c3..21bee9a1 100644 --- a/examples/c++-dataflow/dataflow.yml +++ b/examples/c++-dataflow/dataflow.yml @@ -18,15 +18,15 @@ nodes: # outputs: # - counter - # - id: runtime-node - # operators: - # - id: operator-rust-api - # shared-library: build/operator_rust_api - # inputs: - # counter_1: cxx-node-c-api/counter - # counter_2: cxx-node-rust-api/counter - # outputs: - # - status + - id: runtime-node + operators: + - id: operator-rust-api + shared-library: build/operator_rust_api + inputs: + # counter_1: cxx-node-c-api/counter + counter_2: cxx-node-rust-api/counter + outputs: + - status # - id: operator-c-api # shared-library: build/operator_c_api # inputs: diff --git a/examples/c++-dataflow/run.rs b/examples/c++-dataflow/run.rs index 3e75823d..01435939 100644 --- a/examples/c++-dataflow/run.rs +++ b/examples/c++-dataflow/run.rs @@ -41,6 +41,22 @@ async fn main() -> eyre::Result<()> { ) .await?; + build_package("dora-operator-api-cxx").await?; + 
let operator_cxxbridge = target + .join("cxxbridge") + .join("dora-operator-api-cxx") + .join("src"); + tokio::fs::copy( + operator_cxxbridge.join("lib.rs.cc"), + build_dir.join("operator-bridge.cc"), + ) + .await?; + tokio::fs::copy( + operator_cxxbridge.join("lib.rs.h"), + build_dir.join("dora-operator-api.h"), + ) + .await?; + build_package("dora-node-api-c").await?; // build_package("dora-operator-api-c").await?; build_cxx_node( @@ -62,20 +78,20 @@ async fn main() -> eyre::Result<()> { // &["-l", "dora_node_api_c"], // ) // .await?; - // build_cxx_operator( - // &[ - // &dunce::canonicalize(Path::new("operator-rust-api").join("operator.cc"))?, - // &dunce::canonicalize(build_dir.join("operator-bridge.cc"))?, - // ], - // "operator_rust_api", - // &[ - // "-l", - // "dora_operator_api_cxx", - // "-L", - // &root.join("target").join("debug").to_str().unwrap(), - // ], - // ) - // .await?; + build_cxx_operator( + &[ + &dunce::canonicalize(Path::new("operator-rust-api").join("operator.cc"))?, + &dunce::canonicalize(build_dir.join("operator-bridge.cc"))?, + ], + "operator_rust_api", + &[ + "-l", + "dora_operator_api_cxx", + "-L", + root.join("target").join("debug").to_str().unwrap(), + ], + ) + .await?; // build_cxx_operator( // &[&dunce::canonicalize( // Path::new("operator-c-api").join("operator.cc"), @@ -88,7 +104,9 @@ async fn main() -> eyre::Result<()> { // build_package("dora-runtime").await?; let dataflow = Path::new("dataflow.yml").to_owned(); - dora_daemon::Daemon::run_dataflow(&dataflow, None).await?; + build_package("dora-runtime").await?; + let dora_runtime_path = Some(root.join("target").join("debug").join("dora-runtime")); + dora_daemon::Daemon::run_dataflow(&dataflow, dora_runtime_path).await?; Ok(()) } From 8226805eb40212a5b6cbef65231cb15ae2df10f0 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 10:53:49 +0100 Subject: [PATCH 182/225] Fix import in Rust operator template --- 
binaries/cli/src/template/rust/operator/lib-template.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binaries/cli/src/template/rust/operator/lib-template.rs b/binaries/cli/src/template/rust/operator/lib-template.rs index 3c6d7470..b40be985 100644 --- a/binaries/cli/src/template/rust/operator/lib-template.rs +++ b/binaries/cli/src/template/rust/operator/lib-template.rs @@ -1,4 +1,4 @@ -use dora_operator_api::{register_operator, DoraOperator, DoraOutputSender, DoraStatus}; +use dora_operator_api::{register_operator, DoraOperator, DoraOutputSender, DoraStatus, Event}; register_operator!(ExampleOperator); From de786ef101ed59526c6f46117427a6faccb9186b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 10:55:34 +0100 Subject: [PATCH 183/225] Fix some warnings in runtime --- binaries/runtime/src/operator/shared_lib.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/binaries/runtime/src/operator/shared_lib.rs b/binaries/runtime/src/operator/shared_lib.rs index 0b6c192f..c9401466 100644 --- a/binaries/runtime/src/operator/shared_lib.rs +++ b/binaries/runtime/src/operator/shared_lib.rs @@ -5,7 +5,7 @@ use dora_core::{ descriptor::source_is_url, }; use dora_download::download_file; -use dora_node_api::{uhlc, MetadataParameters}; +use dora_node_api::MetadataParameters; use dora_operator_api_types::{ safer_ffi::closure::ArcDynFn1, DoraDropOperator, DoraInitOperator, DoraInitResult, DoraOnEvent, DoraResult, DoraStatus, Metadata, OnEventResult, Output, SendOutput, @@ -15,7 +15,6 @@ use libloading::Symbol; use std::{ borrow::Cow, ffi::c_void, - ops::Deref, panic::{catch_unwind, AssertUnwindSafe}, path::Path, sync::Arc, @@ -27,7 +26,7 @@ pub fn run( operator_id: &OperatorId, source: &str, events_tx: Sender, - mut incoming_events: Receiver, + incoming_events: Receiver, tracer: Tracer, ) -> eyre::Result<()> { let path = if source_is_url(source) { @@ -51,7 +50,6 @@ pub fn run( libloading::Library::new(&path) 
.wrap_err_with(|| format!("failed to load shared library at `{}`", path.display()))? }; - let hlc = uhlc::HLC::default(); let closure = AssertUnwindSafe(|| { let bindings = Bindings::init(&library).context("failed to init operator")?; @@ -59,7 +57,6 @@ pub fn run( let operator = SharedLibraryOperator { incoming_events, bindings, - hlc, events_tx: events_tx.clone(), }; @@ -85,7 +82,6 @@ struct SharedLibraryOperator<'lib> { events_tx: Sender, bindings: Bindings<'lib>, - hlc: uhlc::HLC, } impl<'lib> SharedLibraryOperator<'lib> { @@ -165,6 +161,7 @@ impl<'lib> SharedLibraryOperator<'lib> { #[cfg(not(feature = "tracing"))] let string_cx = { let () = tracer; + let _ = input_id; "".to_string() }; metadata.parameters.open_telemetry_context = Cow::Owned(string_cx); From aac5a473ced222dec5696ffc5abc3cfcadf7e7d5 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 11:26:29 +0100 Subject: [PATCH 184/225] Re-add C operator example --- examples/c-dataflow/dataflow.yml | 12 ++++- examples/c-dataflow/node.c | 8 ++-- examples/c-dataflow/operator.c | 81 ++++++++++++++++++++++++++++++++ examples/c-dataflow/run.rs | 29 +++++++++++- examples/c-dataflow/sink.c | 3 +- 5 files changed, 125 insertions(+), 8 deletions(-) create mode 100644 examples/c-dataflow/operator.c diff --git a/examples/c-dataflow/dataflow.yml b/examples/c-dataflow/dataflow.yml index 472d7094..1cc144fb 100644 --- a/examples/c-dataflow/dataflow.yml +++ b/examples/c-dataflow/dataflow.yml @@ -9,9 +9,17 @@ nodes: inputs: timer: dora/timer/millis/50 outputs: - - counter + - message + - id: runtime-node + operators: + - id: c_operator + shared-library: build/operator + inputs: + message: c_node/message + outputs: + - counter - id: c_sink custom: source: build/c_sink inputs: - counter: c_node/counter + counter: runtime-node/c_operator/counter diff --git a/examples/c-dataflow/node.c b/examples/c-dataflow/node.c index e858baf1..9c39766a 100644 --- a/examples/c-dataflow/node.c +++ b/examples/c-dataflow/node.c 
@@ -25,7 +25,6 @@ int main() for (char i = 0; i < 100; i++) { - printf("[c node] waiting for next input\n"); void *event = dora_next_event(dora_context); if (event == NULL) { @@ -43,8 +42,11 @@ int main() assert(data_len == 0); - char out_id[] = "counter"; - dora_send_output(dora_context, out_id, strlen(out_id), &i, 1); + char out_id[] = "message"; + char out_data[50]; + int out_data_len = sprintf(out_data, "loop iteration %d", i); + + dora_send_output(dora_context, out_id, strlen(out_id), out_data, out_data_len); } else if (ty == DoraEventType_Stop) { diff --git a/examples/c-dataflow/operator.c b/examples/c-dataflow/operator.c new file mode 100644 index 00000000..ba64e8e3 --- /dev/null +++ b/examples/c-dataflow/operator.c @@ -0,0 +1,81 @@ +#include "../../apis/c/operator/operator_api.h" +#include +#include +#include +#include + +DoraInitResult_t dora_init_operator(void) +{ + void *context = malloc(1); + char *context_char = (char *)context; + *context_char = 0; + + DoraInitResult_t result = {.operator_context = context}; + return result; +} + +DoraResult_t dora_drop_operator(void *operator_context) +{ + free(operator_context); + + DoraResult_t result = {}; + return result; +} + +OnEventResult_t dora_on_event( + const RawEvent_t *event, + const SendOutput_t *send_output, + void *operator_context) +{ + char *counter = (char *)operator_context; + + if (event->input != NULL) + { + // input event + Input_t *input = event->input; + + char id[input->id.len + 1]; + memcpy(id, input->id.ptr, input->id.len); + id[input->id.len] = 0; + + if (strcmp(id, "message") == 0) + { + char data[input->data.len + 1]; + memcpy(data, input->data.ptr, input->data.len); + data[input->data.len] = 0; + + *counter += 1; + printf("C operator received message `%s`, counter: %i\n", data, *counter); + + char *out_id = "counter"; + char *out_id_heap = strdup(out_id); + + int data_alloc_size = 100; + char *out_data = (char *)malloc(data_alloc_size); + int count = snprintf(out_data, data_alloc_size, 
"The current counter value is %d", *counter); + assert(count >= 0 && count < 100); + + Output_t output = {.id = { + .ptr = (uint8_t *)out_id_heap, + .len = strlen(out_id_heap), + .cap = strlen(out_id_heap) + 1, + }, + .data = {.ptr = (uint8_t *)out_data, .len = strlen(out_data), .cap = data_alloc_size}}; + DoraResult_t res = (send_output->send_output.call)(send_output->send_output.env_ptr, output); + + OnEventResult_t result = {.result = res, .status = DORA_STATUS_CONTINUE}; + return result; + } + else + { + printf("C operator received unexpected input %s, context: %i\n", id, *counter); + } + } + if (event->stop) + { + printf("C operator received stop event\n"); + } + + OnEventResult_t result = {.status = DORA_STATUS_CONTINUE}; + return result; +} diff --git a/examples/c-dataflow/run.rs b/examples/c-dataflow/run.rs index 39290ff0..d0bd1ecf 100644 --- a/examples/c-dataflow/run.rs +++ b/examples/c-dataflow/run.rs @@ -17,8 +17,13 @@ async fn main() -> eyre::Result<()> { build_c_node(root, "node.c", "c_node").await?; build_c_node(root, "sink.c", "c_sink").await?; + build_package("dora-operator-api-c").await?; + build_c_operator().await?; + let dataflow = Path::new("dataflow.yml").to_owned(); - dora_daemon::Daemon::run_dataflow(&dataflow, None).await?; + build_package("dora-runtime").await?; + let dora_runtime_path = Some(root.join("target").join("debug").join("dora-runtime")); + dora_daemon::Daemon::run_dataflow(&dataflow, dora_runtime_path).await?; Ok(()) } @@ -97,6 +102,28 @@ async fn build_c_node(root: &Path, name: &str, out_name: &str) -> eyre::Result<( Ok(()) } +async fn build_c_operator() -> eyre::Result<()> { + let mut compile = tokio::process::Command::new("clang"); + compile.arg("-c").arg("operator.c"); + compile.arg("-o").arg("build/operator.o"); + compile.arg("-fdeclspec"); + #[cfg(unix)] + compile.arg("-fPIC"); + if !compile.status().await?.success() { + bail!("failed to compile c operator"); + }; + + let mut link = tokio::process::Command::new("clang"); + 
link.arg("-shared").arg("build/operator.o"); + link.arg("-o") + .arg(Path::new("build").join(library_filename("operator"))); + if !link.status().await?.success() { + bail!("failed to link c operator"); + }; + + Ok(()) +} + // taken from `rust_libloading` crate by Simonas Kazlauskas, licensed under the ISC license ( // see https://github.com/nagisa/rust_libloading/blob/master/LICENSE) pub fn library_filename>(name: S) -> OsString { diff --git a/examples/c-dataflow/sink.c b/examples/c-dataflow/sink.c index 3d40894d..d1b89924 100644 --- a/examples/c-dataflow/sink.c +++ b/examples/c-dataflow/sink.c @@ -18,7 +18,6 @@ int main() while (1) { - printf("[c sink] waiting for next input\n"); void *event = dora_next_event(dora_context); if (event == NULL) { @@ -40,7 +39,7 @@ int main() printf("[c sink] received input `"); fwrite(id, id_len, 1, stdout); - printf("` with data: %d\n", *data); + printf("` with data: %s\n", data); } else if (ty == DoraEventType_InputClosed) { From 9c7ce169669ed8a403d46676421da16a81b9329e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 11:51:55 +0100 Subject: [PATCH 185/225] Add examples for C++ nodes and operators based on C-API again --- examples/c++-dataflow/dataflow.yml | 32 +++---- examples/c++-dataflow/node-c-api/main.cc | 85 +++++++++++++++++++ .../c++-dataflow/operator-c-api/operator.cc | 67 +++++++++------ examples/c++-dataflow/run.rs | 38 ++++----- 4 files changed, 159 insertions(+), 63 deletions(-) create mode 100644 examples/c++-dataflow/node-c-api/main.cc diff --git a/examples/c++-dataflow/dataflow.yml b/examples/c++-dataflow/dataflow.yml index 21bee9a1..7b52ae6d 100644 --- a/examples/c++-dataflow/dataflow.yml +++ b/examples/c++-dataflow/dataflow.yml @@ -10,26 +10,28 @@ nodes: tick: dora/timer/millis/300 outputs: - counter - # - id: cxx-node-c-api - # custom: - # source: build/node_c_api - # inputs: - # tick: dora/timer/millis/300 - # outputs: - # - counter + - id: cxx-node-c-api + custom: + source: 
build/node_c_api + inputs: + tick: dora/timer/millis/300 + outputs: + - counter - - id: runtime-node + - id: runtime-node-1 operators: - id: operator-rust-api shared-library: build/operator_rust_api inputs: - # counter_1: cxx-node-c-api/counter + counter_1: cxx-node-c-api/counter counter_2: cxx-node-rust-api/counter outputs: - status - # - id: operator-c-api - # shared-library: build/operator_c_api - # inputs: - # op_status: runtime-node/operator-rust-api/status - # outputs: - # - half-status + - id: runtime-node-2 + operators: + - id: operator-c-api + shared-library: build/operator_c_api + inputs: + op_status: runtime-node-1/operator-rust-api/status + outputs: + - half-status diff --git a/examples/c++-dataflow/node-c-api/main.cc b/examples/c++-dataflow/node-c-api/main.cc new file mode 100644 index 00000000..8148bf19 --- /dev/null +++ b/examples/c++-dataflow/node-c-api/main.cc @@ -0,0 +1,85 @@ +extern "C" +{ +#include "../../../apis/c/node/node_api.h" +} + +#include +#include + +int run(void *dora_context) +{ + unsigned char counter = 0; + + for (int i = 0; i < 20; i++) + { + void *event = dora_next_event(dora_context); + if (event == NULL) + { + printf("[c node] ERROR: unexpected end of event\n"); + return -1; + } + + enum DoraEventType ty = read_dora_event_type(event); + + if (ty == DoraEventType_Input) + { + counter += 1; + + char *id_ptr; + size_t id_len; + read_dora_input_id(event, &id_ptr, &id_len); + std::string id(id_ptr, id_len); + + char *data_ptr; + size_t data_len; + read_dora_input_data(event, &data_ptr, &data_len); + std::vector data; + for (size_t i = 0; i < data_len; i++) + { + data.push_back(*(data_ptr + i)); + } + + std::cout + << "Received input " + << " (counter: " << (unsigned int)counter << ") data: ["; + for (unsigned char &v : data) + { + std::cout << (unsigned int)v << ", "; + } + std::cout << "]" << std::endl; + + std::vector out_vec{counter}; + std::string out_id = "counter"; + int result = dora_send_output(dora_context, &out_id[0], 
out_id.length(), (char *)&counter, 1); + if (result != 0) + { + std::cerr << "failed to send output" << std::endl; + return 1; + } + } + else if (ty == DoraEventType_Stop) + { + printf("[c node] received stop event\n"); + } + else + { + printf("[c node] received unexpected event: %d\n", ty); + } + + free_dora_event(event); + } + return 0; +} + +int main() +{ + std::cout << "HELLO FROM C++ (using C API)" << std::endl; + + auto dora_context = init_dora_context_from_env(); + auto ret = run(dora_context); + free_dora_context(dora_context); + + std::cout << "GOODBYE FROM C++ node (using C API)" << std::endl; + + return ret; +} diff --git a/examples/c++-dataflow/operator-c-api/operator.cc b/examples/c++-dataflow/operator-c-api/operator.cc index 33fd17d5..7c9fd299 100644 --- a/examples/c++-dataflow/operator-c-api/operator.cc +++ b/examples/c++-dataflow/operator-c-api/operator.cc @@ -30,44 +30,55 @@ extern "C" DoraResult_t dora_drop_operator(void *operator_context) return {}; } -extern "C" OnInputResult_t dora_on_input( - const Input_t *input, +extern "C" OnEventResult_t dora_on_event( + const RawEvent_t *event, const SendOutput_t *send_output, void *operator_context) { + if (event->input != NULL) + { + // input event + Input_t *input = event->input; + std::string id((char *)input->id.ptr, input->id.len); - std::string id((char *)input->id.ptr, input->id.len); + std::vector data; + for (size_t i = 0; i < input->data.len; i++) + { + data.push_back(*(input->data.ptr + i)); + } - std::vector data; - for (size_t i = 0; i < input->data.len; i++) - { - data.push_back(*(input->data.ptr + i)); - } + std::cout + << "C++ Operator (C-API) received input `" << id << "` with data: ["; + for (unsigned char &v : data) + { + std::cout << (unsigned int)v << ", "; + } + std::cout << "]" << std::endl; - std::cout - << "C++ Operator (C-API) received input `" << id << "` with data: ["; - for (unsigned char &v : data) - { - std::cout << (unsigned int)v << ", "; - } - std::cout << "]" << 
std::endl; + const char *out_id = "half-status"; + char *out_id_heap = strdup(out_id); - const char *out_id = "half-status"; - char *out_id_heap = strdup(out_id); + size_t out_data_len = 1; + uint8_t *out_data_heap = (uint8_t *)malloc(out_data_len); + *out_data_heap = data[0] / 2; - size_t out_data_len = 1; - uint8_t *out_data_heap = (uint8_t *)malloc(out_data_len); - *out_data_heap = data[0] / 2; + Output_t output = {.id = { + .ptr = (uint8_t *)out_id_heap, + .len = strlen(out_id_heap), + .cap = strlen(out_id_heap) + 1, + }, + .data = {.ptr = out_data_heap, .len = out_data_len, .cap = out_data_len}}; - Output_t output = {.id = { - .ptr = (uint8_t *)out_id_heap, - .len = strlen(out_id_heap), - .cap = strlen(out_id_heap) + 1, - }, - .data = {.ptr = out_data_heap, .len = out_data_len, .cap = out_data_len}}; + DoraResult_t send_result = (send_output->send_output.call)(send_output->send_output.env_ptr, output); - DoraResult_t send_result = (send_output->send_output.call)(send_output->send_output.env_ptr, output); + OnEventResult_t result = {.result = send_result, .status = DORA_STATUS_CONTINUE}; + return result; + } + if (event->stop) + { + printf("C operator received stop event\n"); + } - OnInputResult_t result = {.result = send_result, .status = DORA_STATUS_CONTINUE}; + OnEventResult_t result = {.status = DORA_STATUS_CONTINUE}; return result; } diff --git a/examples/c++-dataflow/run.rs b/examples/c++-dataflow/run.rs index 01435939..2698b6d9 100644 --- a/examples/c++-dataflow/run.rs +++ b/examples/c++-dataflow/run.rs @@ -58,7 +58,7 @@ async fn main() -> eyre::Result<()> { .await?; build_package("dora-node-api-c").await?; - // build_package("dora-operator-api-c").await?; + build_package("dora-operator-api-c").await?; build_cxx_node( root, &[ @@ -69,15 +69,15 @@ async fn main() -> eyre::Result<()> { &["-l", "dora_node_api_cxx"], ) .await?; - // build_cxx_node( - // root, - // &[&dunce::canonicalize( - // Path::new("node-c-api").join("main.cc"), - // )?], - // 
"node_c_api", - // &["-l", "dora_node_api_c"], - // ) - // .await?; + build_cxx_node( + root, + &[&dunce::canonicalize( + Path::new("node-c-api").join("main.cc"), + )?], + "node_c_api", + &["-l", "dora_node_api_c"], + ) + .await?; build_cxx_operator( &[ &dunce::canonicalize(Path::new("operator-rust-api").join("operator.cc"))?, @@ -92,16 +92,14 @@ async fn main() -> eyre::Result<()> { ], ) .await?; - // build_cxx_operator( - // &[&dunce::canonicalize( - // Path::new("operator-c-api").join("operator.cc"), - // )?], - // "operator_c_api", - // &[], - // ) - // .await?; - - // build_package("dora-runtime").await?; + build_cxx_operator( + &[&dunce::canonicalize( + Path::new("operator-c-api").join("operator.cc"), + )?], + "operator_c_api", + &[], + ) + .await?; let dataflow = Path::new("dataflow.yml").to_owned(); build_package("dora-runtime").await?; From 821aec223604bd2f0dae72804d44fad97a6475f6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 11:55:00 +0100 Subject: [PATCH 186/225] Improve 'unexpected reply' error messages in coordinator --- binaries/coordinator/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 3af96394..60b67f98 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -379,7 +379,7 @@ async fn send_watchdog_message(connection: &mut TcpStream) -> eyre::Result<()> { .wrap_err("failed to deserialize stop reply from daemon")? 
{ DaemonCoordinatorReply::WatchdogAck => Ok(()), - _ => bail!("unexpected reply"), + other => bail!("unexpected reply after sending `watchdog`: {other:?}"), } } @@ -427,7 +427,7 @@ async fn stop_dataflow( DaemonCoordinatorReply::StopResult(result) => result .map_err(|e| eyre!(e)) .wrap_err("failed to stop dataflow")?, - _ => bail!("unexpected reply"), + other => bail!("unexpected reply after sending stop: {other:?}"), } } tracing::info!("successfully stopped dataflow `{uuid}`"); @@ -474,7 +474,7 @@ async fn destroy_daemons(daemon_connections: &mut HashMap) -> DaemonCoordinatorReply::DestroyResult(result) => result .map_err(|e| eyre!(e)) .wrap_err("failed to destroy dataflow")?, - _ => bail!("unexpected reply"), + other => bail!("unexpected reply after sending `destroy`: {other:?}"), } tracing::info!("successfully destroyed daemon `{machine_id}`"); From 2ffae835d526fd60c44349eff7724b068ea17257 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 12:14:06 +0100 Subject: [PATCH 187/225] Fix: send `StopReply` instead of `SpawnReply` after receiving stop message in daemon --- binaries/daemon/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 6bf39107..2338c8dd 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -301,7 +301,7 @@ impl Daemon { } Result::<(), eyre::Report>::Ok(()) }; - let reply = DaemonCoordinatorReply::SpawnResult( + let reply = DaemonCoordinatorReply::StopResult( stop.await.map_err(|err| format!("{err:?}")), ); (reply, RunStatus::Continue) From cdcb19fe28dbcb2f87e1e9625b9454d2723c56e9 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 12:49:08 +0100 Subject: [PATCH 188/225] Wait a bit before exiting coordinator to ensure that destroy confirmation is sent out --- binaries/coordinator/src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/binaries/coordinator/src/lib.rs 
b/binaries/coordinator/src/lib.rs index 60b67f98..6cee74f4 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -51,6 +51,10 @@ pub async fn run(args: Args) -> eyre::Result<()> { // start in daemon mode start(&runtime_path).await?; + // wait a bit before exiting to allow the background control connection threads to send + // out a destroy confirmation to the CLI (if any) + tokio::time::sleep(Duration::from_secs(1)).await; + Ok(()) } From 019e411eefa15b9d977c32e198d46b291d92031b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 16:08:53 +0100 Subject: [PATCH 189/225] Change CLI control request reply type --- binaries/cli/src/check.rs | 9 ++++++--- binaries/cli/src/main.rs | 33 +++++++++++++++++---------------- libraries/core/src/topics.rs | 23 ++++++++--------------- 3 files changed, 31 insertions(+), 34 deletions(-) diff --git a/binaries/cli/src/check.rs b/binaries/cli/src/check.rs index 6d164ea8..cd9620bb 100644 --- a/binaries/cli/src/check.rs +++ b/binaries/cli/src/check.rs @@ -3,7 +3,7 @@ use dora_core::{ adjust_shared_library_path, config::{DataId, InputMapping, OperatorId, UserInputMapping}, descriptor::{self, source_is_url, CoreNodeKind, OperatorSource}, - topics::ControlRequest, + topics::{ControlRequest, ControlRequestReply}, }; use eyre::{bail, eyre, Context}; use std::{env::consts::EXE_EXTENSION, io::Write, path::Path}; @@ -66,14 +66,17 @@ pub fn daemon_running() -> Result { .request(&serde_json::to_vec(&ControlRequest::DaemonConnected).unwrap()) .wrap_err("failed to send DaemonConnected message")?; - serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")? 
+ let reply = serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match reply { + ControlRequestReply::DaemonConnected(running) => running, + other => bail!("unexpected reply to daemon connection check: {other:?}"), + } } Err(_) => { // coordinator is not running false } }; - Ok(running) } diff --git a/binaries/cli/src/main.rs b/binaries/cli/src/main.rs index dec4aeca..cd61d3de 100644 --- a/binaries/cli/src/main.rs +++ b/binaries/cli/src/main.rs @@ -1,9 +1,6 @@ use clap::Parser; use communication_layer_request_reply::{RequestReplyLayer, TcpLayer, TcpRequestReplyConnection}; -use dora_core::topics::{ - control_socket_addr, ControlRequest, DataflowId, ListDataflowResult, StartDataflowResult, - StopDataflowResult, -}; +use dora_core::topics::{control_socket_addr, ControlRequest, ControlRequestReply, DataflowId}; use eyre::{bail, Context}; use std::path::PathBuf; use uuid::Uuid; @@ -169,14 +166,15 @@ fn start_dataflow( ) .wrap_err("failed to send start dataflow message")?; - let result: StartDataflowResult = + let result: ControlRequestReply = serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; match result { - StartDataflowResult::Ok { uuid } => { + ControlRequestReply::DataflowStarted { uuid } => { println!("{uuid}"); Ok(()) } - StartDataflowResult::Error(err) => bail!(err), + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), } } @@ -206,11 +204,12 @@ fn stop_dataflow( .unwrap(), ) .wrap_err("failed to send dataflow stop message")?; - let result: StopDataflowResult = + let result: ControlRequestReply = serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; match result { - StopDataflowResult::Ok => Ok(()), - StopDataflowResult::Error(err) => bail!(err), + ControlRequestReply::DataflowStopped { uuid: _ } => Ok(()), + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected stop dataflow reply: {other:?}"), } } @@ -221,11 
+220,12 @@ fn stop_dataflow_by_name( let reply_raw = control_connection(session)? .request(&serde_json::to_vec(&ControlRequest::StopByName { name }).unwrap()) .wrap_err("failed to send dataflow stop_by_name message")?; - let result: StopDataflowResult = + let result: ControlRequestReply = serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; match result { - StopDataflowResult::Ok => Ok(()), - StopDataflowResult::Error(err) => bail!(err), + ControlRequestReply::DataflowStopped { uuid: _ } => Ok(()), + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected stop dataflow reply: {other:?}"), } } @@ -250,11 +250,12 @@ fn query_running_dataflows( let reply_raw = control_connection(session)? .request(&serde_json::to_vec(&ControlRequest::List).unwrap()) .wrap_err("failed to send list message")?; - let reply: ListDataflowResult = + let reply: ControlRequestReply = serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; let ids = match reply { - ListDataflowResult::Ok { dataflows } => dataflows, - ListDataflowResult::Error(err) => bail!(err), + ControlRequestReply::DataflowList { dataflows } => dataflows, + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected list dataflow reply: {other:?}"), }; Ok(ids) diff --git a/libraries/core/src/topics.rs b/libraries/core/src/topics.rs index 524f6299..b90bebe1 100644 --- a/libraries/core/src/topics.rs +++ b/libraries/core/src/topics.rs @@ -30,22 +30,15 @@ pub enum ControlRequest { DaemonConnected, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub enum StartDataflowResult { - Ok { uuid: Uuid }, - Error(String), -} - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub enum StopDataflowResult { - Ok, - Error(String), -} - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub enum ListDataflowResult { - Ok { dataflows: Vec }, +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub enum 
ControlRequestReply { Error(String), + CoordinatorStopped, + DataflowStarted { uuid: Uuid }, + DataflowStopped { uuid: Uuid }, + DataflowList { dataflows: Vec }, + DestroyOk, + DaemonConnected(bool), } #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] From 871dd712f02ad78c7c18b7f1928ad7a5c865ab89 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 16:10:31 +0100 Subject: [PATCH 190/225] Make control channel listener async, handle coordinator exit cleanly, update reply type --- binaries/coordinator/src/control.rs | 161 ++++++++++++++++------------ binaries/coordinator/src/lib.rs | 57 ++++------ 2 files changed, 112 insertions(+), 106 deletions(-) diff --git a/binaries/coordinator/src/control.rs b/binaries/coordinator/src/control.rs index e0a0bd5a..60e59fd3 100644 --- a/binaries/coordinator/src/control.rs +++ b/binaries/coordinator/src/control.rs @@ -1,13 +1,19 @@ -use crate::Event; -use communication_layer_request_reply::{ListenConnection, RequestReplyLayer, TcpLayer}; -use dora_core::topics::ControlRequest; -use eyre::Context; -use futures::{Stream, StreamExt}; -use std::{ - io::{self, ErrorKind}, - net::SocketAddr, +use crate::{ + tcp_utils::{tcp_receive, tcp_send}, + Event, +}; +use dora_core::topics::{ControlRequest, ControlRequestReply}; +use eyre::{eyre, Context}; +use futures::{ + future::{self, Either}, + FutureExt, Stream, StreamExt, +}; +use futures_concurrency::future::Race; +use std::{io::ErrorKind, net::SocketAddr}; +use tokio::{ + net::{TcpListener, TcpStream}, + sync::{mpsc, oneshot}, }; -use tokio::sync::{mpsc, oneshot}; use tokio_stream::wrappers::ReceiverStream; pub(crate) async fn control_events( @@ -15,15 +21,14 @@ pub(crate) async fn control_events( ) -> eyre::Result> { let (tx, rx) = mpsc::channel(10); - std::thread::spawn(move || listen(control_listen_addr, tx)); + tokio::spawn(listen(control_listen_addr, tx)); Ok(ReceiverStream::new(rx).map(Event::Control)) } -fn listen(control_listen_addr: SocketAddr, tx: 
mpsc::Sender) { - let mut com_layer = TcpLayer::new(); - let result = com_layer - .listen(control_listen_addr) +async fn listen(control_listen_addr: SocketAddr, tx: mpsc::Sender) { + let result = TcpListener::bind(control_listen_addr) + .await .wrap_err("failed to listen for control messages"); let incoming = match result { Ok(incoming) => incoming, @@ -33,11 +38,20 @@ fn listen(control_listen_addr: SocketAddr, tx: mpsc::Sender) { } }; - for connection in incoming { + loop { + let new_connection = incoming.accept().map(Either::Left); + let coordinator_stop = tx.closed().map(Either::Right); + let connection = match (new_connection, coordinator_stop).race().await { + future::Either::Left(connection) => connection, + future::Either::Right(()) => { + // coordinator was stopped + break; + } + }; match connection.wrap_err("failed to connect") { - Ok(connection) => { + Ok((connection, _)) => { let tx = tx.clone(); - std::thread::spawn(|| handle_requests(connection, tx)); + tokio::spawn(handle_requests(connection, tx)); } Err(err) => { if tx.blocking_send(err.into()).is_err() { @@ -48,75 +62,88 @@ fn listen(control_listen_addr: SocketAddr, tx: mpsc::Sender) { } } -fn handle_requests( - mut connection: Box< - dyn ListenConnection, ReplyData = Vec, Error = std::io::Error>, - >, - tx: mpsc::Sender, -) { +async fn handle_requests(mut connection: TcpStream, tx: mpsc::Sender) { loop { - let tx = tx.clone(); - let result = connection.handle_next(Box::new(move |raw| { - let (reply, reply_rx) = oneshot::channel(); - let request = match serde_json::from_slice(&raw) { - Ok(request) => ControlEvent::IncomingRequest { - request, - reply_sender: reply, + let next_request = tcp_receive(&mut connection).map(Either::Left); + let coordinator_stopped = tx.closed().map(Either::Right); + let raw = match (next_request, coordinator_stopped).race().await { + Either::Right(()) => break, + Either::Left(request) => match request { + Ok(message) => message, + Err(err) => match err.kind() { + 
ErrorKind::UnexpectedEof => { + tracing::trace!("Control connection closed"); + break; + } + err => { + let err = eyre!(err).wrap_err("failed to receive incoming message"); + tracing::error!("{err}"); + break; + } }, - Err(err) => return Err(io::Error::new(ErrorKind::Other, HandlerError::from(err))), + }, + }; + + let result = + match serde_json::from_slice(&raw).wrap_err("failed to deserialize incoming message") { + Ok(request) => handle_request(request, &tx).await, + Err(err) => Err(err), }; - if tx.blocking_send(request).is_err() { - return Err(io::Error::new( - io::ErrorKind::Other, - HandlerError::ServerStopped, - )); - } - let Ok(reply) = reply_rx.blocking_recv() else { - return Err(io::Error::new( - io::ErrorKind::Other, - HandlerError::ServerStopped, - )); + let reply = result.unwrap_or_else(|err| ControlRequestReply::Error(format!("{err}"))); + let serialized = + match serde_json::to_vec(&reply).wrap_err("failed to serialize ControlRequestReply") { + Ok(s) => s, + Err(err) => { + tracing::error!("{err:?}"); + break; + } }; - Ok(reply) - })); - if let Err(err) = result { - match err.kind() { + match tcp_send(&mut connection, &serialized).await { + Ok(()) => {} + Err(err) => match err.kind() { ErrorKind::UnexpectedEof => { - tracing::trace!("Control connection closed"); + tracing::debug!("Control connection closed while trying to send reply"); break; } - ErrorKind::Other => { - let inner = err.into_inner().unwrap(); - let downcasted = inner.downcast_ref().unwrap(); - match downcasted { - HandlerError::ParseError(err) => { - tracing::warn!("failed to parse request: {err}"); - } - HandlerError::ServerStopped => break, - } - } - _ => { - tracing::warn!("I/O error while trying to receive control request: {err:?}"); + err => { + let err = eyre!(err).wrap_err("failed to send reply"); + tracing::error!("{err}"); + break; } - } + }, + } + + if matches!(reply, ControlRequestReply::CoordinatorStopped) { + break; } } } -#[derive(Debug, thiserror::Error)] -enum 
HandlerError { - #[error("failed to parse request")] - ParseError(#[from] serde_json::Error), - #[error("server was stopped already")] - ServerStopped, +async fn handle_request( + request: ControlRequest, + tx: &mpsc::Sender, +) -> eyre::Result { + let (reply_tx, reply_rx) = oneshot::channel(); + let event = ControlEvent::IncomingRequest { + request, + reply_sender: reply_tx, + }; + + if tx.send(event).await.is_err() { + return Ok(ControlRequestReply::CoordinatorStopped); + } + + reply_rx + .await + .unwrap_or(Ok(ControlRequestReply::CoordinatorStopped)) } #[derive(Debug)] pub enum ControlEvent { IncomingRequest { request: ControlRequest, - reply_sender: oneshot::Sender>, + reply_sender: oneshot::Sender>, }, Error(eyre::Report), } diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 6cee74f4..0eef2142 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -8,8 +8,8 @@ use dora_core::{ coordinator_messages::RegisterResult, daemon_messages::{DaemonCoordinatorEvent, DaemonCoordinatorReply}, topics::{ - control_socket_addr, ControlRequest, DataflowId, ListDataflowResult, StartDataflowResult, - StopDataflowResult, DORA_COORDINATOR_PORT_DEFAULT, + control_socket_addr, ControlRequest, ControlRequestReply, DataflowId, + DORA_COORDINATOR_PORT_DEFAULT, }, }; use eyre::{bail, eyre, ContextCompat, WrapErr}; @@ -51,10 +51,6 @@ pub async fn run(args: Args) -> eyre::Result<()> { // start in daemon mode start(&runtime_path).await?; - // wait a bit before exiting to allow the background control connection threads to send - // out a destroy confirmation to the CLI (if any) - tokio::time::sleep(Duration::from_secs(1)).await; - Ok(()) } @@ -193,18 +189,11 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { .await?; Ok(dataflow) }; - let reply = match inner.await { - Ok(dataflow) => { - let uuid = dataflow.uuid; - running_dataflows.insert(uuid, dataflow); - StartDataflowResult::Ok { uuid } - } - Err(err) => { - 
tracing::error!("{err:?}"); - StartDataflowResult::Error(format!("{err:?}")) - } - }; - serde_json::to_vec(&reply).unwrap() + inner.await.map(|dataflow| { + let uuid = dataflow.uuid; + running_dataflows.insert(uuid, dataflow); + ControlRequestReply::DataflowStarted { uuid } + }) } ControlRequest::Stop { dataflow_uuid } => { let stop = async { @@ -216,12 +205,9 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { .await?; Result::<_, eyre::Report>::Ok(()) }; - let reply = match stop.await { - Ok(()) => StopDataflowResult::Ok, - Err(err) => StopDataflowResult::Error(format!("{err:?}")), - }; - - serde_json::to_vec(&reply).unwrap() + stop.await.map(|()| ControlRequestReply::DataflowStopped { + uuid: dataflow_uuid, + }) } ControlRequest::StopByName { name } => { let stop = async { @@ -245,14 +231,10 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { &mut daemon_connections, ) .await?; - Result::<_, eyre::Report>::Ok(()) - }; - let reply = match stop.await { - Ok(()) => StopDataflowResult::Ok, - Err(err) => StopDataflowResult::Error(format!("{err:?}")), + Result::<_, eyre::Report>::Ok(dataflow_uuid) }; - - serde_json::to_vec(&reply).unwrap() + stop.await + .map(|uuid| ControlRequestReply::DataflowStopped { uuid }) } ControlRequest::Destroy => { tracing::info!("Received destroy command"); @@ -263,15 +245,14 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { &abort_handle, &mut daemon_events_tx, ) - .await?; - - b"ok".as_slice().into() + .await + .map(|()| ControlRequestReply::DestroyOk) } ControlRequest::List => { let mut dataflows: Vec<_> = running_dataflows.values().collect(); dataflows.sort_by_key(|d| (&d.name, d.uuid)); - let reply = ListDataflowResult::Ok { + Ok(ControlRequestReply::DataflowList { dataflows: dataflows .into_iter() .map(|d| DataflowId { @@ -279,13 +260,11 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { name: d.name.clone(), }) .collect(), - }; - - serde_json::to_vec(&reply).unwrap() + }) } 
ControlRequest::DaemonConnected => { let running = !daemon_connections.is_empty(); - serde_json::to_vec(&running).unwrap() + Ok(ControlRequestReply::DaemonConnected(running)) } }; let _ = reply_sender.send(reply); From a9e9976b6bcc019412cc6477d5d4275b69076728 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 16:11:42 +0100 Subject: [PATCH 191/225] Remove old unneeded code from runtime --- binaries/coordinator/src/run/mod.rs | 25 +---- binaries/coordinator/src/run/runtime.rs | 116 ------------------------ 2 files changed, 2 insertions(+), 139 deletions(-) delete mode 100644 binaries/coordinator/src/run/runtime.rs diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index 454cb5c8..efff67a1 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -1,23 +1,19 @@ use crate::tcp_utils::{tcp_receive, tcp_send}; use dora_core::{ - config::{CommunicationConfig, NodeId}, + config::CommunicationConfig, daemon_messages::{DaemonCoordinatorEvent, DaemonCoordinatorReply, SpawnDataflowNodes}, descriptor::{CoreNodeKind, Descriptor}, }; use eyre::{bail, eyre, ContextCompat, WrapErr}; -use futures::{stream::FuturesUnordered, StreamExt}; use std::{ - collections::{BTreeMap, BTreeSet, HashMap}, + collections::{BTreeSet, HashMap}, env::consts::EXE_EXTENSION, path::Path, }; use tokio::net::TcpStream; -use tracing::warn; use uuid::Uuid; -mod runtime; - pub async fn spawn_dataflow( runtime: &Path, dataflow_path: &Path, @@ -116,20 +112,3 @@ async fn read_descriptor(file: &Path) -> Result { serde_yaml::from_slice(&descriptor_file).context("failed to parse given descriptor")?; Ok(descriptor) } - -fn command_init_common_env( - command: &mut tokio::process::Command, - node_id: &NodeId, - communication: &dora_core::config::CommunicationConfig, -) -> Result<(), eyre::Error> { - command.env( - "DORA_NODE_ID", - serde_yaml::to_string(&node_id).wrap_err("failed to serialize custom node ID")?, - ); - 
command.env( - "DORA_COMMUNICATION_CONFIG", - serde_yaml::to_string(communication) - .wrap_err("failed to serialize communication config")?, - ); - Ok(()) -} diff --git a/binaries/coordinator/src/run/runtime.rs b/binaries/coordinator/src/run/runtime.rs deleted file mode 100644 index 2a3c5541..00000000 --- a/binaries/coordinator/src/run/runtime.rs +++ /dev/null @@ -1,116 +0,0 @@ -use super::command_init_common_env; -use dora_core::{ - config::NodeId, - descriptor::{self, EnvValue, OperatorSource}, -}; -use eyre::{eyre, WrapErr}; -use std::{collections::BTreeMap, path::Path}; - -#[tracing::instrument(skip(node))] -pub fn spawn_runtime_node( - runtime: &Path, - node_id: NodeId, - node: &descriptor::RuntimeNode, - envs: &Option>, - communication: &dora_core::config::CommunicationConfig, - working_dir: &Path, -) -> eyre::Result>> { - let has_python_operator = node - .operators - .iter() - .any(|x| matches!(x.config.source, OperatorSource::Python { .. })); - - let has_other_operator = node - .operators - .iter() - .any(|x| !matches!(x.config.source, OperatorSource::Python { .. })); - - let mut command = if has_python_operator && !has_other_operator { - // Use python to spawn runtime if there is a python operator - let mut command = tokio::process::Command::new("python3"); - command.args(["-c", "import dora; dora.start_runtime()"]); - command - } else if !has_python_operator && has_other_operator { - // Use default runtime if there is no python operator - tokio::process::Command::new(runtime) - } else { - return Err(eyre!( - "Runtime can not mix Python Operator with other type of operator." - )); - }; - - command_init_common_env(&mut command, &node_id, communication)?; - command.env( - "DORA_OPERATORS", - serde_yaml::to_string(&node.operators) - .wrap_err("failed to serialize custom node run config")?, - ); - - // Injecting the env variable defined in the `yaml` into - // the node runtime. 
- if let Some(envs) = &envs { - for (key, value) in envs { - command.env(key, value.to_string()); - } - } - - command.current_dir(working_dir); - - let mut child = command - .spawn() - .wrap_err_with(|| format!("failed to run runtime at `{}`", runtime.display()))?; - let result = tokio::spawn(async move { - let status = child.wait().await.context("child process failed")?; - if status.success() { - tracing::info!("runtime node {node_id} finished"); - Ok(()) - } else if let Some(code) = status.code() { - if let Some(meaning) = exit_code_meaning(code) { - Err(eyre!( - "runtime node {node_id} failed with exit code: {code}, meaning: {meaning}" - )) - } else { - Err(eyre!( - "runtime node {node_id} failed with exit code: {code} with unknwon meaning." - )) - } - } else { - Err(eyre!("runtime node {node_id} failed (unknown exit code)")) - } - }); - Ok(result) -} - -fn exit_code_meaning(code: i32) -> Option { - if cfg!(unix) { - let meaning = match code { - 0 => "Success", - 1 => "Catchall for general errors", - 2 => "Misuse of shell built-ins", - 64 => "Usage Error", - 65 => "Data Error", - 66 => "No Input", - 67 => "No User", - 68 => "No Host", - 69 => "Service Unavailable", - 70 => "Software Error", - 71 => "OS Error", - 72 => "OS File Error", - 73 => "Cannot Create", - 74 => "IO Error", - 75 => "Temporary Failure", - 76 => "Protocol Error", - 77 => "No Permission", - 78 => "Config Error", - 126 => "Command invoked cannot execute", - 127 => "Command not found", - 128 => "Invalid argument to `exit`", - 256.. 
=> "Exit status out of range", - _ => "Unknown Error code.", - } - .to_string(); - Some(meaning) - } else { - None - } -} From e594a11c5d7d7f610440e1e2c9ad39c7490b8e26 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 24 Feb 2023 17:06:40 +0100 Subject: [PATCH 192/225] Add debug message to example C++ node --- examples/c++-dataflow/node-rust-api/main.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/c++-dataflow/node-rust-api/main.cc b/examples/c++-dataflow/node-rust-api/main.cc index caf11373..31415523 100644 --- a/examples/c++-dataflow/node-rust-api/main.cc +++ b/examples/c++-dataflow/node-rust-api/main.cc @@ -44,5 +44,7 @@ int main() } } + std::cout << "GOODBYE FROM C++ node (using Rust API)" << std::endl; + return 0; } From a9203e027430297ea258147b805e3dc4b6d9295e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 10:27:23 +0100 Subject: [PATCH 193/225] Add intermediate operator channel buffer to avoid blocking runtime --- Cargo.lock | 1 + binaries/runtime/Cargo.toml | 1 + binaries/runtime/src/lib.rs | 8 +- binaries/runtime/src/operator/channel.rs | 92 +++++++++++++++++++++ binaries/runtime/src/operator/mod.rs | 5 +- binaries/runtime/src/operator/python.rs | 6 +- binaries/runtime/src/operator/shared_lib.rs | 10 +-- 7 files changed, 109 insertions(+), 14 deletions(-) create mode 100644 binaries/runtime/src/operator/channel.rs diff --git a/Cargo.lock b/Cargo.lock index a888e51a..3bb2d198 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1179,6 +1179,7 @@ dependencies = [ "dora-operator-api-types", "dora-tracing", "eyre", + "flume", "futures", "futures-concurrency", "libloading", diff --git a/binaries/runtime/Cargo.toml b/binaries/runtime/Cargo.toml index 2f972c7d..542877c7 100644 --- a/binaries/runtime/Cargo.toml +++ b/binaries/runtime/Cargo.toml @@ -30,6 +30,7 @@ pyo3 = { version = "0.16", features = ["eyre", "abi3-py37"] } tracing = "0.1.36" tracing-subscriber = "0.3.15" dora-download = { path = 
"../../libraries/extensions/download" } +flume = "0.10.14" [features] tracing = ["opentelemetry", "dora-tracing"] diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index cbee3a23..91831f0f 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -71,7 +71,7 @@ pub fn main() -> eyre::Result<()> { .wrap_err("Could not build a tokio runtime.")?; let mut operator_channels = HashMap::new(); - let (operator_channel, incoming_events) = mpsc::channel(10); + let (operator_channel, incoming_events) = operator::channel::channel(tokio_runtime.handle()); operator_channels.insert(operator_definition.id.clone(), operator_channel); tracing::info!("spawning main task"); @@ -107,7 +107,7 @@ async fn run( mut node: DoraNode, operators: HashMap, mut events: impl Stream + Unpin, - mut operator_channels: HashMap>, + mut operator_channels: HashMap>, ) -> eyre::Result<()> { #[cfg(feature = "metrics")] let _started = { @@ -198,7 +198,7 @@ async fn run( Event::Stop => { // forward stop event to all operators and close the event channels for (_, channel) in operator_channels.drain() { - let _ = channel.send(operator::IncomingEvent::Stop).await; + let _ = channel.send_async(operator::IncomingEvent::Stop).await; } } Event::Input { id, metadata, data } => { @@ -214,7 +214,7 @@ async fn run( }; if let Err(err) = operator_channel - .send(operator::IncomingEvent::Input { + .send_async(operator::IncomingEvent::Input { input_id: input_id.clone(), metadata, data, diff --git a/binaries/runtime/src/operator/channel.rs b/binaries/runtime/src/operator/channel.rs new file mode 100644 index 00000000..1cf094fb --- /dev/null +++ b/binaries/runtime/src/operator/channel.rs @@ -0,0 +1,92 @@ +use super::IncomingEvent; +use futures::{ + future::{self, FusedFuture}, + FutureExt, +}; +use std::collections::VecDeque; + +pub fn channel( + runtime: &tokio::runtime::Handle, +) -> (flume::Sender, flume::Receiver) { + let (incoming_tx, incoming_rx) = flume::bounded(10); + let 
(outgoing_tx, outgoing_rx) = flume::bounded(0); + + runtime.spawn(async { + let mut buffer = InputBuffer::new(); + buffer.run(incoming_rx, outgoing_tx).await; + }); + + (incoming_tx, outgoing_rx) +} + +struct InputBuffer { + queue: VecDeque, +} + +impl InputBuffer { + pub fn new() -> Self { + Self { + queue: VecDeque::new(), + } + } + + pub async fn run( + &mut self, + incoming: flume::Receiver, + outgoing: flume::Sender, + ) { + let mut send_out_buf = future::Fuse::terminated(); + loop { + let next_incoming = incoming.recv_async(); + match future::select(next_incoming, send_out_buf).await { + future::Either::Left((event, mut send_out)) => { + match event { + Ok(event) => { + // received a new event -> push it to the queue + self.queue.push_back(event); + + // TODO: drop oldest events when queue becomes too full + + // if outgoing queue is empty, fill it again + if send_out.is_terminated() { + send_out = self.send_next_queued(&outgoing); + } + } + Err(flume::RecvError::Disconnected) => { + // the incoming channel was closed -> exit if we sent out all events already + if send_out.is_terminated() && self.queue.is_empty() { + break; + } + } + } + + // reassign the send_out future, which might be still in progress + send_out_buf = send_out; + } + future::Either::Right((send_result, _)) => match send_result { + Ok(()) => { + send_out_buf = self.send_next_queued(&outgoing); + } + Err(flume::SendError(_)) => break, + }, + }; + } + } + + fn send_next_queued<'a>( + &mut self, + outgoing: &'a flume::Sender, + ) -> future::Fuse> { + if let Some(next) = self.queue.pop_front() { + outgoing.send_async(next).fuse() + } else { + future::Fuse::terminated() + } + } +} + +impl Default for InputBuffer { + fn default() -> Self { + Self::new() + } +} diff --git a/binaries/runtime/src/operator/mod.rs b/binaries/runtime/src/operator/mod.rs index 1810b5b7..df2548e2 100644 --- a/binaries/runtime/src/operator/mod.rs +++ b/binaries/runtime/src/operator/mod.rs @@ -12,18 +12,19 @@ use pyo3::{ 
IntoPy, PyObject, Python, }; use std::any::Any; -use tokio::sync::mpsc::{Receiver, Sender}; +use tokio::sync::mpsc::Sender; #[cfg(not(feature = "tracing"))] type Tracer = (); +pub mod channel; mod python; mod shared_lib; pub fn run_operator( node_id: &NodeId, operator_definition: OperatorDefinition, - incoming_events: Receiver, + incoming_events: flume::Receiver, events_tx: Sender, ) -> eyre::Result<()> { #[cfg(feature = "tracing")] diff --git a/binaries/runtime/src/operator/python.rs b/binaries/runtime/src/operator/python.rs index 5f8655cc..c5406ce5 100644 --- a/binaries/runtime/src/operator/python.rs +++ b/binaries/runtime/src/operator/python.rs @@ -14,7 +14,7 @@ use std::{ panic::{catch_unwind, AssertUnwindSafe}, path::Path, }; -use tokio::sync::mpsc::{Receiver, Sender}; +use tokio::sync::mpsc::Sender; fn traceback(err: pyo3::PyErr) -> eyre::Report { let traceback = Python::with_gil(|py| err.traceback(py).and_then(|t| t.format().ok())); @@ -31,7 +31,7 @@ pub fn run( operator_id: &OperatorId, source: &str, events_tx: Sender, - mut incoming_events: Receiver, + incoming_events: flume::Receiver, tracer: Tracer, ) -> eyre::Result<()> { let path = if source_is_url(source) { @@ -100,7 +100,7 @@ pub fn run( Python::with_gil(init_operator).wrap_err("failed to init python operator")?; let reason = loop { - let Some(mut event) = incoming_events.blocking_recv() else { break StopReason::InputsClosed }; + let Ok(mut event) = incoming_events.recv() else { break StopReason::InputsClosed }; if let IncomingEvent::Input { input_id, metadata, .. 
diff --git a/binaries/runtime/src/operator/shared_lib.rs b/binaries/runtime/src/operator/shared_lib.rs index c9401466..9e6e5666 100644 --- a/binaries/runtime/src/operator/shared_lib.rs +++ b/binaries/runtime/src/operator/shared_lib.rs @@ -19,14 +19,14 @@ use std::{ path::Path, sync::Arc, }; -use tokio::sync::mpsc::{Receiver, Sender}; +use tokio::sync::mpsc::Sender; pub fn run( node_id: &NodeId, operator_id: &OperatorId, source: &str, events_tx: Sender, - incoming_events: Receiver, + incoming_events: flume::Receiver, tracer: Tracer, ) -> eyre::Result<()> { let path = if source_is_url(source) { @@ -78,14 +78,14 @@ pub fn run( } struct SharedLibraryOperator<'lib> { - incoming_events: Receiver, + incoming_events: flume::Receiver, events_tx: Sender, bindings: Bindings<'lib>, } impl<'lib> SharedLibraryOperator<'lib> { - fn run(mut self, tracer: Tracer) -> eyre::Result { + fn run(self, tracer: Tracer) -> eyre::Result { let operator_context = { let DoraInitResult { result, @@ -134,7 +134,7 @@ impl<'lib> SharedLibraryOperator<'lib> { }); let reason = loop { - let Some(mut event) = self.incoming_events.blocking_recv() else { + let Ok(mut event) = self.incoming_events.recv() else { break StopReason::InputsClosed }; From 0f29d26fe8a573975fe946a047ee95ecb191016e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 11:01:08 +0100 Subject: [PATCH 194/225] Drop oldest operator inputs when queue becomes too full --- binaries/runtime/src/operator/channel.rs | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/binaries/runtime/src/operator/channel.rs b/binaries/runtime/src/operator/channel.rs index 1cf094fb..a125a157 100644 --- a/binaries/runtime/src/operator/channel.rs +++ b/binaries/runtime/src/operator/channel.rs @@ -83,6 +83,36 @@ impl InputBuffer { future::Fuse::terminated() } } + + fn add_event(&mut self, event: IncomingEvent) { + self.queue.push_back(event); + + // drop oldest input events to maintain max queue length queue + let 
input_event_count = self + .queue + .iter() + .filter(|e| matches!(e, IncomingEvent::Input { .. })) + .count(); + let drop_n = input_event_count.saturating_sub(self.max_queue_len); + if drop_n > 0 { + self.drop_oldest_inputs(drop_n); + } + } + + fn drop_oldest_inputs(&mut self, number: usize) { + tracing::debug!("dropping {number} operator inputs because event queue is too full"); + for i in 0..number { + // find index of oldest input event + let index = self + .queue + .iter() + .position(|e| matches!(e, IncomingEvent::Input { .. })) + .unwrap_or_else(|| panic!("no input event found in drop iteration {i}")); + + // remove that event + self.queue.remove(index); + } + } } impl Default for InputBuffer { From a35850cb0715a72eab72b9750e2ab7c3eecd6564 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 11:01:08 +0100 Subject: [PATCH 195/225] Drop oldest operator inputs when queue becomes too full --- binaries/runtime/src/operator/channel.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/binaries/runtime/src/operator/channel.rs b/binaries/runtime/src/operator/channel.rs index a125a157..d8f1ca27 100644 --- a/binaries/runtime/src/operator/channel.rs +++ b/binaries/runtime/src/operator/channel.rs @@ -21,12 +21,14 @@ pub fn channel( struct InputBuffer { queue: VecDeque, + max_queue_len: usize, } impl InputBuffer { pub fn new() -> Self { Self { queue: VecDeque::new(), + max_queue_len: 10, } } @@ -43,7 +45,7 @@ impl InputBuffer { match event { Ok(event) => { // received a new event -> push it to the queue - self.queue.push_back(event); + self.add_event(event); // TODO: drop oldest events when queue becomes too full From 0aebe9c3c44cafbf6af408208596f8e6494b3454 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 14:49:09 +0100 Subject: [PATCH 196/225] Flush TCP connection after sending message --- apis/rust/node/src/daemon/tcp.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/apis/rust/node/src/daemon/tcp.rs 
b/apis/rust/node/src/daemon/tcp.rs index da2b749b..17ed4351 100644 --- a/apis/rust/node/src/daemon/tcp.rs +++ b/apis/rust/node/src/daemon/tcp.rs @@ -42,6 +42,7 @@ fn tcp_send(connection: &mut (impl Write + Unpin), message: &[u8]) -> std::io::R let len_raw = (message.len() as u64).to_le_bytes(); connection.write_all(&len_raw)?; connection.write_all(message)?; + connection.flush()?; Ok(()) } From 2c80b6bcc243c463df1bb4c2148a03dcac8167d6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 14:50:07 +0100 Subject: [PATCH 197/225] Log when dropping inputs in daemon --- binaries/daemon/src/listener/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/binaries/daemon/src/listener/mod.rs b/binaries/daemon/src/listener/mod.rs index 7b3d6812..82a1f1d9 100644 --- a/binaries/daemon/src/listener/mod.rs +++ b/binaries/daemon/src/listener/mod.rs @@ -218,12 +218,15 @@ where .filter(|e| matches!(e, NodeEvent::Input { .. })) .count(); let drop_n = input_event_count.saturating_sub(self.max_queue_len); - self.drop_oldest_inputs(drop_n).await?; + if drop_n > 0 { + self.drop_oldest_inputs(drop_n).await?; + } } Ok(()) } async fn drop_oldest_inputs(&mut self, number: usize) -> Result<(), eyre::ErrReport> { + tracing::debug!("dropping {number} inputs because event queue is too full"); let mut drop_tokens = Vec::new(); for i in 0..number { // find index of oldest input event From 28ae686fe408f202f66d351f7d0a59b3757e0337 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 14:54:16 +0100 Subject: [PATCH 198/225] Remove sleep between events in python examples --- examples/python-dataflow/no_webcam.py | 2 -- examples/python-operator-dataflow/no_webcam.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/examples/python-dataflow/no_webcam.py b/examples/python-dataflow/no_webcam.py index 01a99cb2..dac61cea 100755 --- a/examples/python-dataflow/no_webcam.py +++ b/examples/python-dataflow/no_webcam.py @@ -26,5 +26,3 @@ while 
time.time() - start < 20: print("received stop") case other: print("received unexpected event:", other) - - time.sleep(1) diff --git a/examples/python-operator-dataflow/no_webcam.py b/examples/python-operator-dataflow/no_webcam.py index 3c322c24..be43244d 100755 --- a/examples/python-operator-dataflow/no_webcam.py +++ b/examples/python-operator-dataflow/no_webcam.py @@ -29,5 +29,3 @@ while time.time() - start < 20: print("received stop") case other: print("received unexpected event:", other) - - time.sleep(1) From a84f91c572bc47b7e670736711a3df86178f3cbc Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 15:56:15 +0100 Subject: [PATCH 199/225] Fix: Don't keep on polling incoming event channel after it's closed This causes the `send_out_buf` future to completely starve. --- binaries/runtime/src/operator/channel.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/binaries/runtime/src/operator/channel.rs b/binaries/runtime/src/operator/channel.rs index d8f1ca27..135260cc 100644 --- a/binaries/runtime/src/operator/channel.rs +++ b/binaries/runtime/src/operator/channel.rs @@ -38,8 +38,13 @@ impl InputBuffer { outgoing: flume::Sender, ) { let mut send_out_buf = future::Fuse::terminated(); + let mut incoming_closed = false; loop { - let next_incoming = incoming.recv_async(); + let next_incoming = if incoming_closed { + future::Fuse::terminated() + } else { + incoming.recv_async().fuse() + }; match future::select(next_incoming, send_out_buf).await { future::Either::Left((event, mut send_out)) => { match event { @@ -55,6 +60,7 @@ impl InputBuffer { } } Err(flume::RecvError::Disconnected) => { + incoming_closed = true; // the incoming channel was closed -> exit if we sent out all events already if send_out.is_terminated() && self.queue.is_empty() { break; From bcbd9f016cb1ae0d33c2e5be81df5b5ece229357 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 15:57:30 +0100 Subject: [PATCH 200/225] Fix typo in log 
message --- binaries/daemon/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 2338c8dd..244b0122 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -153,10 +153,10 @@ impl Daemon { let mut ctrlc_sent = false; ctrlc::set_handler(move || { if ctrlc_sent { - tracing::warn!("received second ctrc signal -> aborting immediately"); + tracing::warn!("received second ctrlc signal -> aborting immediately"); std::process::abort(); } else { - tracing::info!("received ctrc signal"); + tracing::info!("received ctrlc signal"); if ctrlc_tx.blocking_send(Event::CtrlC).is_err() { tracing::error!("failed to report ctrl-c event to dora-daemon"); } From ea537d45f674417c22f4f087e52e53e9e8c50cd7 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 15:58:19 +0100 Subject: [PATCH 201/225] Remove resolved TODO --- binaries/runtime/src/operator/channel.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/binaries/runtime/src/operator/channel.rs b/binaries/runtime/src/operator/channel.rs index 135260cc..963a6c35 100644 --- a/binaries/runtime/src/operator/channel.rs +++ b/binaries/runtime/src/operator/channel.rs @@ -52,8 +52,6 @@ impl InputBuffer { // received a new event -> push it to the queue self.add_event(event); - // TODO: drop oldest events when queue becomes too full - // if outgoing queue is empty, fill it again if send_out.is_terminated() { send_out = self.send_next_queued(&outgoing); From 598ece765e073e86c297325cfee85dadf284e8e0 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 16:03:48 +0100 Subject: [PATCH 202/225] Don't panic in runtime when operator panics --- binaries/runtime/src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index 91831f0f..b6c733e0 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -136,7 
+136,9 @@ async fn run( OperatorEvent::Error(err) => { bail!(err.wrap_err(format!("operator {operator_id} failed"))) } - OperatorEvent::Panic(payload) => std::panic::resume_unwind(payload), + OperatorEvent::Panic(payload) => { + bail!("operator {operator_id} panicked: {payload:?}"); + } OperatorEvent::Finished { reason } => { if let StopReason::ExplicitStopAll = reason { let hlc = dora_core::message::uhlc::HLC::default(); From c70aa21d4509827e137321e105690c93efe1ca81 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 28 Feb 2023 16:04:14 +0100 Subject: [PATCH 203/225] Python example: Print number of received image and bounding box messages --- examples/python-operator-dataflow/plot.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/python-operator-dataflow/plot.py b/examples/python-operator-dataflow/plot.py index dc0c4d63..d4dc9a41 100755 --- a/examples/python-operator-dataflow/plot.py +++ b/examples/python-operator-dataflow/plot.py @@ -25,6 +25,8 @@ class Operator: def __init__(self): self.image = [] self.bboxs = [] + self.bounding_box_messages = 0 + self.image_messages = 0 def on_event( self, @@ -53,9 +55,16 @@ class Operator: frame = cv2.imdecode(frame, -1) self.image = frame + self.image_messages += 1 + print("received " + str(self.image_messages) + " images") + elif dora_input["id"] == "bbox" and len(self.image) != 0: bboxs = np.frombuffer(dora_input["data"], dtype="float32") self.bboxs = np.reshape(bboxs, (-1, 6)) + + self.bounding_box_messages += 1 + print("received " + str(self.bounding_box_messages) + " bounding boxes") + for bbox in self.bboxs: [ min_x, From 2ba397c64415dde0ec4fc02e25310dd50b2fe93f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 13:54:29 +0100 Subject: [PATCH 204/225] Fix: Don't wait on event stream thread on drop, as it might be dropped later It is not guaranteed that the `EventStream` is dropped before the `DoraNode`. 
If it is dropped later on the same thread, this `join` leads to a deadlock. --- apis/rust/node/src/daemon/mod.rs | 8 +++----- apis/rust/node/src/lib.rs | 17 +++-------------- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/apis/rust/node/src/daemon/mod.rs b/apis/rust/node/src/daemon/mod.rs index 799848f7..39acf0fa 100644 --- a/apis/rust/node/src/daemon/mod.rs +++ b/apis/rust/node/src/daemon/mod.rs @@ -12,7 +12,6 @@ mod tcp; pub(crate) struct DaemonConnection { pub control_channel: ControlChannel, pub event_stream: EventStream, - pub(crate) event_stream_thread: JoinHandle<()>, } impl DaemonConnection { @@ -46,13 +45,12 @@ impl DaemonConnection { let control_channel = ControlChannel::init(dataflow_id, node_id, control) .wrap_err("failed to init control stream")?; - let (event_stream, event_stream_thread) = EventStream::init(dataflow_id, node_id, events) + let event_stream = EventStream::init(dataflow_id, node_id, events) .wrap_err("failed to init event stream")?; Ok(Self { control_channel, event_stream, - event_stream_thread, }) } } @@ -226,7 +224,7 @@ impl EventStream { dataflow_id: DataflowId, node_id: &NodeId, mut channel: DaemonChannel, - ) -> eyre::Result<(Self, JoinHandle<()>)> { + ) -> eyre::Result { register(dataflow_id, node_id.clone(), &mut channel)?; channel @@ -288,7 +286,7 @@ impl EventStream { } }); - Ok((EventStream { receiver: rx }, thread)) + Ok(EventStream { receiver: rx }) } pub fn recv(&mut self) -> Option { diff --git a/apis/rust/node/src/lib.rs b/apis/rust/node/src/lib.rs index 32a57095..4304215c 100644 --- a/apis/rust/node/src/lib.rs +++ b/apis/rust/node/src/lib.rs @@ -1,5 +1,3 @@ -use std::thread::JoinHandle; - use daemon::{ControlChannel, DaemonConnection}; pub use daemon::{Event, EventStream}; pub use dora_core; @@ -19,7 +17,6 @@ pub struct DoraNode { node_config: NodeRunConfig, control_channel: ControlChannel, hlc: uhlc::HLC, - event_stream_thread: Option>, } impl DoraNode { @@ -46,7 +43,6 @@ impl DoraNode { let 
DaemonConnection { control_channel, event_stream, - event_stream_thread, } = DaemonConnection::init(dataflow_id, &node_id, &daemon_communication) .wrap_err("failed to connect to dora-daemon")?; @@ -55,7 +51,6 @@ impl DoraNode { node_config: run_config, control_channel, hlc: uhlc::HLC::default(), - event_stream_thread: Some(event_stream_thread), }; Ok((node, event_stream)) } @@ -128,15 +123,9 @@ impl DoraNode { impl Drop for DoraNode { #[tracing::instrument(skip(self), fields(self.id = %self.id))] fn drop(&mut self) { - match self.control_channel.report_stop() { - Ok(()) => { - if let Some(thread) = self.event_stream_thread.take() { - if let Err(panic) = thread.join() { - std::panic::resume_unwind(panic); - } - } - } - Err(err) => tracing::error!("{err:?}"), + tracing::info!("reporting node stop for node `{}`", self.id); + if let Err(err) = self.control_channel.report_stop() { + tracing::error!("{err:?}") } } } From 125638f27e41093c1d7fbe2aefd399a30f21b967 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 14:04:43 +0100 Subject: [PATCH 205/225] Report event stream errors through channel instead of panicking Avoids the need to catch the thread's panics. 
--- apis/rust/node/src/daemon/mod.rs | 161 ++++++++++++++++++------------- 1 file changed, 95 insertions(+), 66 deletions(-) diff --git a/apis/rust/node/src/daemon/mod.rs b/apis/rust/node/src/daemon/mod.rs index 39acf0fa..15f1040b 100644 --- a/apis/rust/node/src/daemon/mod.rs +++ b/apis/rust/node/src/daemon/mod.rs @@ -5,7 +5,7 @@ use dora_core::{ }; use eyre::{bail, eyre, Context}; use shared_memory_server::{Shmem, ShmemClient, ShmemConf}; -use std::{marker::PhantomData, net::TcpStream, thread::JoinHandle, time::Duration}; +use std::{marker::PhantomData, net::TcpStream, time::Duration}; mod tcp; @@ -213,7 +213,13 @@ fn register( Ok(()) } -type EventItem = (NodeEvent, std::sync::mpsc::Sender<()>); +enum EventItem { + NodeEvent { + event: NodeEvent, + ack_channel: std::sync::mpsc::Sender<()>, + }, + FatalError(eyre::Report), +} pub struct EventStream { receiver: flume::Receiver, @@ -234,55 +240,70 @@ impl EventStream { let (tx, rx) = flume::bounded(0); let mut drop_tokens = Vec::new(); - let thread = std::thread::spawn(move || loop { - let daemon_request = DaemonRequest::NextEvent { - drop_tokens: std::mem::take(&mut drop_tokens), - }; - let event: NodeEvent = match channel.request(&daemon_request) { - Ok(DaemonReply::NodeEvent(event)) => event, - Ok(DaemonReply::Closed) => { - tracing::debug!("Event stream closed"); - break; - } - Ok(other) => { - let err = eyre!("unexpected control reply: {other:?}"); - tracing::warn!("{err:?}"); - continue; + let node_id = node_id.clone(); + std::thread::spawn(move || { + let result = loop { + let daemon_request = DaemonRequest::NextEvent { + drop_tokens: std::mem::take(&mut drop_tokens), + }; + let event: NodeEvent = match channel.request(&daemon_request) { + Ok(DaemonReply::NodeEvent(event)) => event, + Ok(DaemonReply::Closed) => { + tracing::debug!("Event stream closed for node ID `{node_id}`"); + break Ok(()); + } + Ok(other) => { + let err = eyre!("unexpected control reply: {other:?}"); + tracing::warn!("{err:?}"); + 
continue; + } + Err(err) => { + let err = eyre!(err).wrap_err("failed to receive incoming event"); + tracing::warn!("{err:?}"); + continue; + } + }; + let drop_token = match &event { + NodeEvent::Input { + data: Some(data), .. + } => Some(data.drop_token.clone()), + NodeEvent::Stop + | NodeEvent::InputClosed { .. } + | NodeEvent::Input { data: None, .. } => None, + }; + + let (drop_tx, drop_rx) = std::sync::mpsc::channel(); + match tx.send(EventItem::NodeEvent { + event, + ack_channel: drop_tx, + }) { + Ok(()) => {} + Err(_) => { + // receiving end of channel was closed + break Ok(()); + } } - Err(err) => { - let err = eyre!(err).wrap_err("failed to receive incoming event"); - tracing::warn!("{err:?}"); - continue; - } - }; - let drop_token = match &event { - NodeEvent::Input { - data: Some(data), .. - } => Some(data.drop_token.clone()), - NodeEvent::Stop - | NodeEvent::InputClosed { .. } - | NodeEvent::Input { data: None, .. } => None, - }; - let (drop_tx, drop_rx) = std::sync::mpsc::channel(); - match tx.send((event, drop_tx)) { - Ok(()) => {} - Err(_) => { - // receiving end of channel was closed - break; + match drop_rx.recv_timeout(Duration::from_secs(30)) { + Ok(()) => break Err(eyre!("Node API should not send anything on ACK channel")), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + tracing::warn!("timeout while waiting for input ACK"); + } + Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => {} // expected result } - } - match drop_rx.recv_timeout(Duration::from_secs(30)) { - Ok(()) => panic!("Node API should not send anything on ACK channel"), - Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { - tracing::warn!("timeout while waiting for input ACK"); + if let Some(token) = drop_token { + drop_tokens.push(token); + } + }; + if let Err(err) = result { + if let Err(flume::SendError(item)) = tx.send(EventItem::FatalError(err)) { + let err = match item { + EventItem::FatalError(err) => err, + _ => unreachable!(), + }; + tracing::error!("failed 
to report fatal EventStream error: {err:?}"); } - Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => {} // expected result - } - - if let Some(token) = drop_token { - drop_tokens.push(token); } }); @@ -300,28 +321,36 @@ impl EventStream { } fn recv_common(&mut self, event: Result) -> Option { - let (node_event, drop_sender) = match event { - Ok(d) => d, - Err(flume::RecvError::Disconnected) => return None, + let event = match event { + Ok(event) => event, + Err(flume::RecvError::Disconnected) => { + tracing::info!("event channel disconnected"); + return None; + } }; - let event = match node_event { - NodeEvent::Stop => Event::Stop, - NodeEvent::InputClosed { id } => Event::InputClosed { id }, - NodeEvent::Input { id, metadata, data } => { - let mapped = data - .map(|d| unsafe { MappedInputData::map(&d.shared_memory_id, d.len) }) - .transpose(); - match mapped { - Ok(mapped) => Event::Input { - id, - metadata, - data: mapped.map(|data| Data { - data, - _drop: drop_sender, - }), - }, - Err(err) => Event::Error(format!("{err:?}")), + let event = match event { + EventItem::NodeEvent { event, ack_channel } => match event { + NodeEvent::Stop => Event::Stop, + NodeEvent::InputClosed { id } => Event::InputClosed { id }, + NodeEvent::Input { id, metadata, data } => { + let mapped = data + .map(|d| unsafe { MappedInputData::map(&d.shared_memory_id, d.len) }) + .transpose(); + match mapped { + Ok(mapped) => Event::Input { + id, + metadata, + data: mapped.map(|data| Data { + data, + _drop: ack_channel, + }), + }, + Err(err) => Event::Error(format!("{err:?}")), + } } + }, + EventItem::FatalError(err) => { + Event::Error(format!("fatal event stream error: {err:?}")) } }; From 085a0723db5785296af3fbf5b8bc81718ea81d45 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 14:35:28 +0100 Subject: [PATCH 206/225] Re-add joining of event stream thread, now based on shared ownership and using a timeout Without joining the thread it is killed suddenly by the OS 
when the executable exits. This causes interrupted connections to the daemon, which leads to some errors in the log messages. This commit fixes this by joining the event stream thread again, with the following improvements: - Instead of joining the thread directly on drop, we now do the joining in a second thread and report the join result over a channel. This allows us to use a timeout when waiting for the join result in the drop implementation, so that we don't block indefinitely. - We now share the ownership of the join handle between the node and the event stream. This way, its drop handler is only run after both instances were dropped. This way we avoid the deadlock that happened when joining the event stream thread before the event stream instance was dropped. --- apis/rust/node/src/daemon/mod.rs | 61 +++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/apis/rust/node/src/daemon/mod.rs b/apis/rust/node/src/daemon/mod.rs index 15f1040b..2f1616aa 100644 --- a/apis/rust/node/src/daemon/mod.rs +++ b/apis/rust/node/src/daemon/mod.rs @@ -4,8 +4,9 @@ use dora_core::{ message::Metadata, }; use eyre::{bail, eyre, Context}; +use flume::RecvTimeoutError; use shared_memory_server::{Shmem, ShmemClient, ShmemConf}; -use std::{marker::PhantomData, net::TcpStream, time::Duration}; +use std::{marker::PhantomData, net::TcpStream, sync::Arc, time::Duration}; mod tcp; @@ -42,11 +43,14 @@ impl DaemonConnection { } }; - let control_channel = ControlChannel::init(dataflow_id, node_id, control) + let mut control_channel = ControlChannel::init(dataflow_id, node_id, control) .wrap_err("failed to init control stream")?; - let event_stream = EventStream::init(dataflow_id, node_id, events) - .wrap_err("failed to init event stream")?; + let (event_stream, event_stream_thread_handle) = + EventStream::init(dataflow_id, node_id, events) + .wrap_err("failed to init event stream")?; + + control_channel.event_stream_thread_handle = 
Some(event_stream_thread_handle); Ok(Self { control_channel, @@ -57,6 +61,7 @@ impl DaemonConnection { pub(crate) struct ControlChannel { channel: DaemonChannel, + event_stream_thread_handle: Option>, } impl ControlChannel { @@ -68,7 +73,10 @@ impl ControlChannel { ) -> eyre::Result { register(dataflow_id, node_id.clone(), &mut channel)?; - Ok(Self { channel }) + Ok(Self { + channel, + event_stream_thread_handle: None, + }) } pub fn report_stop(&mut self) -> eyre::Result<()> { @@ -223,6 +231,7 @@ enum EventItem { pub struct EventStream { receiver: flume::Receiver, + _thread_handle: Arc, } impl EventStream { @@ -230,7 +239,7 @@ impl EventStream { dataflow_id: DataflowId, node_id: &NodeId, mut channel: DaemonChannel, - ) -> eyre::Result { + ) -> eyre::Result<(Self, Arc)> { register(dataflow_id, node_id.clone(), &mut channel)?; channel @@ -241,7 +250,7 @@ impl EventStream { let (tx, rx) = flume::bounded(0); let mut drop_tokens = Vec::new(); let node_id = node_id.clone(); - std::thread::spawn(move || { + let join_handle = std::thread::spawn(move || { let result = loop { let daemon_request = DaemonRequest::NextEvent { drop_tokens: std::mem::take(&mut drop_tokens), @@ -307,7 +316,15 @@ impl EventStream { } }); - Ok(EventStream { receiver: rx }) + let thread_handle = EventStreamThreadHandle::new(join_handle); + + Ok(( + EventStream { + receiver: rx, + _thread_handle: thread_handle.clone(), + }, + thread_handle, + )) } pub fn recv(&mut self) -> Option { @@ -423,3 +440,31 @@ impl std::ops::Deref for MappedInputData<'_> { unsafe { &self.memory.as_slice()[..self.len] } } } + +struct EventStreamThreadHandle(flume::Receiver>); +impl EventStreamThreadHandle { + fn new(join_handle: std::thread::JoinHandle<()>) -> Arc { + let (tx, rx) = flume::bounded(1); + std::thread::spawn(move || { + let _ = tx.send(join_handle.join()); + }); + Arc::new(Self(rx)) + } +} + +impl Drop for EventStreamThreadHandle { + fn drop(&mut self) { + match self.0.recv_timeout(Duration::from_secs(2)) { + 
Ok(Ok(())) => {} + Ok(Err(_)) => { + tracing::error!("event stream thread panicked"); + } + Err(RecvTimeoutError::Timeout) => { + tracing::warn!("timeout while waiting for event stream thread"); + } + Err(RecvTimeoutError::Disconnected) => { + tracing::warn!("event stream thread result channel closed unexpectedly"); + } + } + } +} From 6b0a7de34aa87704d858e1f8a37d45be456c7fb4 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 14:42:55 +0100 Subject: [PATCH 207/225] Remove unneeded `free_dora_node` from C++ node API The cxx crate is able to invoke the Rust destructors from C++ --- apis/c++/node/src/lib.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/apis/c++/node/src/lib.rs b/apis/c++/node/src/lib.rs index 10aba1ba..7d827510 100644 --- a/apis/c++/node/src/lib.rs +++ b/apis/c++/node/src/lib.rs @@ -33,7 +33,6 @@ mod ffi { type DoraEvent<'a>; fn init_dora_node() -> Result; - fn free_dora_node(node: DoraNode); fn next_event(inputs: &mut Box) -> Box>; fn event_type(event: &Box) -> DoraEventType; @@ -57,10 +56,6 @@ fn init_dora_node() -> eyre::Result { }) } -fn free_dora_node(node: ffi::DoraNode) { - let _ = node; -} - pub struct Events(EventStream); fn next_event(events: &mut Box) -> Box { From df17e2b2cfa641decb49803320f9ac30618d09d1 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 14:48:59 +0100 Subject: [PATCH 208/225] Enable tracing subscriber for C and C++ node APIs by default --- apis/c++/node/Cargo.toml | 4 ++++ apis/c/node/Cargo.toml | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/apis/c++/node/Cargo.toml b/apis/c++/node/Cargo.toml index 5d1ce54f..595a7548 100644 --- a/apis/c++/node/Cargo.toml +++ b/apis/c++/node/Cargo.toml @@ -8,6 +8,10 @@ edition = "2021" [lib] crate-type = ["staticlib"] +[features] +default = ["tracing-subscriber"] +tracing-subscriber = ["dora-node-api/tracing-subscriber"] + [dependencies] cxx = "1.0.73" dora-node-api = { workspace = true } diff --git 
a/apis/c/node/Cargo.toml b/apis/c/node/Cargo.toml index 7579960f..3d95d245 100644 --- a/apis/c/node/Cargo.toml +++ b/apis/c/node/Cargo.toml @@ -9,11 +9,14 @@ license = "Apache-2.0" [lib] crate-type = ["staticlib"] +[features] +default = ["tracing-subscriber"] +tracing-subscriber = ["dora-node-api/tracing-subscriber"] + [dependencies] eyre = "0.6.8" flume = "0.10.14" tracing = "0.1.33" [dependencies.dora-node-api] -default-features = false -path = "../../rust/node" +workspace = true From 4b03b773ceb21faad6916e0b2180fe8defa74cab Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 14:50:31 +0100 Subject: [PATCH 209/225] Minor improvements to C++ node API --- apis/c++/node/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apis/c++/node/src/lib.rs b/apis/c++/node/src/lib.rs index 7d827510..2e5f1314 100644 --- a/apis/c++/node/src/lib.rs +++ b/apis/c++/node/src/lib.rs @@ -47,11 +47,11 @@ mod ffi { fn init_dora_node() -> eyre::Result { let (node, events) = dora_node_api::DoraNode::init_from_env()?; - let inputs = Events(events); + let events = Events(events); let send_output = OutputSender(node); Ok(ffi::DoraNode { - events: Box::new(inputs), + events: Box::new(events), send_output: Box::new(send_output), }) } @@ -64,7 +64,7 @@ fn next_event(events: &mut Box) -> Box { pub struct DoraEvent<'a>(Option>); -fn event_type(event: &Box) -> ffi::DoraEventType { +fn event_type(event: &DoraEvent) -> ffi::DoraEventType { match &event.0 { Some(event) => match event { Event::Stop => ffi::DoraEventType::Stop, From 4d13a29dead3bf8933838bc1115e5058ee659efb Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 14:52:13 +0100 Subject: [PATCH 210/225] Improve error messages and fix clippy warnings in daemon --- binaries/daemon/src/lib.rs | 2 +- binaries/daemon/src/listener/mod.rs | 32 +++++++++++++++++------------ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/binaries/daemon/src/lib.rs 
b/binaries/daemon/src/lib.rs index 244b0122..9a6f66bf 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -127,7 +127,7 @@ impl Daemon { } }); - let (dataflow_errors, _) = future::try_join(run_result, spawn_result).await?; + let (dataflow_errors, ()) = future::try_join(run_result, spawn_result).await?; if dataflow_errors.is_empty() { Ok(()) diff --git a/binaries/daemon/src/listener/mod.rs b/binaries/daemon/src/listener/mod.rs index 82a1f1d9..3bec44ef 100644 --- a/binaries/daemon/src/listener/mod.rs +++ b/binaries/daemon/src/listener/mod.rs @@ -237,13 +237,11 @@ where .unwrap_or_else(|| panic!("no input event found in drop iteration {i}")); // remove that event - if let Some(event) = self.queue.remove(index) { - if let NodeEvent::Input { - data: Some(data), .. - } = event - { - drop_tokens.push(data.drop_token); - } + if let Some(NodeEvent::Input { + data: Some(data), .. + }) = self.queue.remove(index) + { + drop_tokens.push(data.drop_token); } } self.report_drop_tokens(drop_tokens).await?; @@ -255,7 +253,9 @@ where match message { DaemonRequest::Register { .. 
} => { let reply = DaemonReply::Result(Err("unexpected register message".into())); - self.send_reply(reply).await?; + self.send_reply(reply) + .await + .wrap_err("failed to send register reply")?; } DaemonRequest::Stopped => self.process_daemon_event(DaemonNodeEvent::Stopped).await?, DaemonRequest::CloseOutputs(outputs) => { @@ -281,7 +281,9 @@ where .await .wrap_err("failed to receive prepare output reply")?; // tracing::debug!("prepare latency: {:?}", start.elapsed()?); - self.send_reply(reply).await?; + self.send_reply(reply) + .await + .wrap_err("failed to send PrepareOutputMessage reply")?; } DaemonRequest::SendPreparedMessage { id } => { let (reply_sender, reply) = oneshot::channel(); @@ -290,7 +292,7 @@ where self.send_reply( reply .await - .wrap_err("failed to receive send output reply")?, + .wrap_err("failed to receive SendPreparedMessage reply")?, ) .await?; } @@ -311,7 +313,9 @@ where .send_daemon_event(event) .await .map_err(|_| "failed to receive send_empty_message reply".to_owned()); - self.send_reply(DaemonReply::Result(result)).await?; + self.send_reply(DaemonReply::Result(result)) + .await + .wrap_err("failed to send SendEmptyMessage reply")?; } DaemonRequest::Subscribe => { let (tx, rx) = flume::bounded(100); @@ -342,7 +346,9 @@ where } }; - self.send_reply(reply).await?; + self.send_reply(reply) + .await + .wrap_err("failed to send NextEvent reply")?; } } Ok(()) @@ -385,7 +391,7 @@ where self.connection .send_reply(reply) .await - .wrap_err("failed to send reply to node") + .wrap_err_with(|| format!("failed to send reply to node `{}`", self.node_id)) } async fn send_shared_memory_event( From 32bbd145778685fa8e2f540061d7bd185d8e857e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 14:53:47 +0100 Subject: [PATCH 211/225] Enable tracing for C, C++, and `rust-dataflow-url` examples --- examples/c++-dataflow/run.rs | 19 ++++++++++++++++++- examples/c-dataflow/run.rs | 15 +++++++++++++++ examples/rust-dataflow-url/run.rs | 15 
+++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/examples/c++-dataflow/run.rs b/examples/c++-dataflow/run.rs index 2698b6d9..e1c782e9 100644 --- a/examples/c++-dataflow/run.rs +++ b/examples/c++-dataflow/run.rs @@ -4,11 +4,17 @@ use std::{ ffi::{OsStr, OsString}, path::Path, }; +use tracing::metadata::LevelFilter; +use tracing_subscriber::Layer; #[tokio::main] async fn main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing")?; + if cfg!(windows) { - eprintln!("The c++ example does not work on Windows currently because of a linker error"); + tracing::error!( + "The c++ example does not work on Windows currently because of a linker error" + ); return Ok(()); } @@ -275,3 +281,14 @@ pub fn library_filename>(name: S) -> OsString { string.push(DLL_SUFFIX); string } + +fn set_up_tracing() -> eyre::Result<()> { + use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer() + .pretty() + .with_filter(LevelFilter::DEBUG); + let subscriber = tracing_subscriber::Registry::default().with(stdout_log); + tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} diff --git a/examples/c-dataflow/run.rs b/examples/c-dataflow/run.rs index d0bd1ecf..32e8a7e2 100644 --- a/examples/c-dataflow/run.rs +++ b/examples/c-dataflow/run.rs @@ -4,9 +4,13 @@ use std::{ ffi::{OsStr, OsString}, path::Path, }; +use tracing::metadata::LevelFilter; +use tracing_subscriber::Layer; #[tokio::main] async fn main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing")?; + let root = Path::new(env!("CARGO_MANIFEST_DIR")); std::env::set_current_dir(root.join(file!()).parent().unwrap()) .wrap_err("failed to set working dir")?; @@ -134,3 +138,14 @@ pub fn library_filename>(name: S) -> OsString { string.push(DLL_SUFFIX); string } + +fn set_up_tracing() -> eyre::Result<()> { + use 
tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer() + .pretty() + .with_filter(LevelFilter::DEBUG); + let subscriber = tracing_subscriber::Registry::default().with(stdout_log); + tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} diff --git a/examples/rust-dataflow-url/run.rs b/examples/rust-dataflow-url/run.rs index 3c384b6d..2710f5cf 100644 --- a/examples/rust-dataflow-url/run.rs +++ b/examples/rust-dataflow-url/run.rs @@ -1,8 +1,12 @@ use eyre::{bail, Context}; use std::path::Path; +use tracing::metadata::LevelFilter; +use tracing_subscriber::Layer; #[tokio::main] async fn main() -> eyre::Result<()> { + set_up_tracing().wrap_err("failed to set up tracing")?; + let root = Path::new(env!("CARGO_MANIFEST_DIR")); std::env::set_current_dir(root.join(file!()).parent().unwrap()) .wrap_err("failed to set working dir")?; @@ -37,3 +41,14 @@ async fn build_package(package: &str) -> eyre::Result<()> { }; Ok(()) } + +fn set_up_tracing() -> eyre::Result<()> { + use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt; + + let stdout_log = tracing_subscriber::fmt::layer() + .pretty() + .with_filter(LevelFilter::DEBUG); + let subscriber = tracing_subscriber::Registry::default().with(stdout_log); + tracing::subscriber::set_global_default(subscriber) + .context("failed to set tracing global subscriber") +} From 3ad61509194e751a9e93b1fedd893b766664e454 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 15:19:57 +0100 Subject: [PATCH 212/225] Fix: don't error if the elapsed time is negative (which can happen on some systems) --- examples/benchmark/sink/src/main.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/benchmark/sink/src/main.rs b/examples/benchmark/sink/src/main.rs index 05772dcb..7ce064b8 100644 --- a/examples/benchmark/sink/src/main.rs +++ b/examples/benchmark/sink/src/main.rs @@ 
-47,7 +47,14 @@ fn main() -> eyre::Result<()> { } n += 1; - latencies.push(metadata.timestamp().get_time().to_system_time().elapsed()?); + latencies.push( + metadata + .timestamp() + .get_time() + .to_system_time() + .elapsed() + .unwrap_or_default(), + ); } Event::InputClosed { id } => { println!("Input `{id}` was closed"); From 6462d858412e906039fc50f19583aee3d5368e48 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 17:21:19 +0100 Subject: [PATCH 213/225] CI: Run CLI commands in single step --- .github/workflows/ci.yml | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37eb22ae..4bbcfdd2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -96,27 +96,19 @@ jobs: cargo install --path binaries/runtime cargo install --path binaries/cli - - run: dora-cli up - - - name: "Test dora list" - run: dora-cli list - - - name: "Test new command" - run: | - dora-cli new test_project - - name: "Test start and stop command" timeout-minutes: 30 run: | + dora-cli up + dora-cli list cd test_project + dora-cli new test_project cargo build --all --config "patch.'https://github.com/dora-rs/dora.git'.dora-node-api.path=\"../apis/rust/node\"" --config "patch.'https://github.com/dora-rs/dora.git'.dora-operator-api.path=\"../apis/rust/operator\"" UUID=$(dora-cli start dataflow.yml) sleep 10 dora-cli stop $UUID cd .. 
- - - name: "Test dora destroy" - run: dora-cli destroy + dora-cli destroy examples-remote: name: "Examples (Remote)" From 8670d85f63ca3aff777fffcab125c1e8f69844cc Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 17:44:43 +0100 Subject: [PATCH 214/225] CI: Fix command order in CLI test job --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4bbcfdd2..75be7a71 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -96,13 +96,13 @@ jobs: cargo install --path binaries/runtime cargo install --path binaries/cli - - name: "Test start and stop command" + - name: "Test CLI" timeout-minutes: 30 run: | dora-cli up dora-cli list - cd test_project dora-cli new test_project + cd test_project cargo build --all --config "patch.'https://github.com/dora-rs/dora.git'.dora-node-api.path=\"../apis/rust/node\"" --config "patch.'https://github.com/dora-rs/dora.git'.dora-operator-api.path=\"../apis/rust/operator\"" UUID=$(dora-cli start dataflow.yml) sleep 10 From 4bbb724876a096103e5d34046f789efb26e24cb8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 1 Mar 2023 19:07:11 +0100 Subject: [PATCH 215/225] Flush TCP connections on sending in coordinator and daemon too --- binaries/coordinator/src/tcp_utils.rs | 1 + binaries/daemon/src/tcp_utils.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/binaries/coordinator/src/tcp_utils.rs b/binaries/coordinator/src/tcp_utils.rs index 31f5e3b5..57003f7b 100644 --- a/binaries/coordinator/src/tcp_utils.rs +++ b/binaries/coordinator/src/tcp_utils.rs @@ -7,6 +7,7 @@ pub async fn tcp_send(connection: &mut TcpStream, message: &[u8]) -> std::io::Re let len_raw = (message.len() as u64).to_le_bytes(); connection.write_all(&len_raw).await?; connection.write_all(message).await?; + connection.flush().await?; Ok(()) } diff --git a/binaries/daemon/src/tcp_utils.rs b/binaries/daemon/src/tcp_utils.rs index 
b6c31e30..db327c58 100644 --- a/binaries/daemon/src/tcp_utils.rs +++ b/binaries/daemon/src/tcp_utils.rs @@ -7,6 +7,7 @@ pub async fn tcp_send( let len_raw = (message.len() as u64).to_le_bytes(); connection.write_all(&len_raw).await?; connection.write_all(message).await?; + connection.flush().await?; Ok(()) } From 97dab13a93655d7ea953edc732634609ca66f236 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 09:41:51 +0100 Subject: [PATCH 216/225] Wait for spawned listener tasks before exiting coordinator Ensures that we don't exit while some listener task is still sending out a reply. --- binaries/coordinator/src/lib.rs | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 0eef2142..b8a6467e 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -13,7 +13,7 @@ use dora_core::{ }, }; use eyre::{bail, eyre, ContextCompat, WrapErr}; -use futures::{Stream, StreamExt}; +use futures::{stream::FuturesUnordered, Stream, StreamExt}; use futures_concurrency::stream::Merge; use run::SpawnedDataflow; use std::{ @@ -21,7 +21,7 @@ use std::{ path::{Path, PathBuf}, time::Duration, }; -use tokio::{net::TcpStream, sync::mpsc}; +use tokio::{net::TcpStream, sync::mpsc, task::JoinHandle}; use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream}; use uuid::Uuid; @@ -48,13 +48,23 @@ pub async fn run(args: Args) -> eyre::Result<()> { .with_file_name("dora-runtime") }); + let mut tasks = FuturesUnordered::new(); + // start in daemon mode - start(&runtime_path).await?; + start(&runtime_path, &tasks).await?; + + tracing::debug!("coordinator main loop finished, waiting on spawned tasks"); + while let Some(join_result) = tasks.next().await { + if let Err(err) = join_result { + tracing::error!("task panicked: {err}"); + } + } + tracing::debug!("all spawned tasks finished, exiting.."); Ok(()) } -async fn start(runtime_path: &Path) -> 
eyre::Result<()> { +async fn start(runtime_path: &Path, tasks: &FuturesUnordered>) -> eyre::Result<()> { let ctrlc_events = set_up_ctrlc_handler()?; let listener = listener::create_listener(DORA_COORDINATOR_PORT_DEFAULT).await?; @@ -101,7 +111,8 @@ async fn start(runtime_path: &Path) -> eyre::Result<()> { connection.set_nodelay(true)?; let events_tx = daemon_events_tx.clone(); if let Some(events_tx) = events_tx { - tokio::spawn(listener::handle_connection(connection, events_tx)); + let task = tokio::spawn(listener::handle_connection(connection, events_tx)); + tasks.push(task); } else { tracing::warn!( "ignoring new daemon connection because events_tx was closed already" From aae4a66705ebec8d29163473b51e6443330ebba0 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 09:51:07 +0100 Subject: [PATCH 217/225] Wait for control connections to finish too --- binaries/coordinator/src/control.rs | 23 +++++++++++++++++++---- binaries/coordinator/src/lib.rs | 2 +- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/binaries/coordinator/src/control.rs b/binaries/coordinator/src/control.rs index 60e59fd3..51a8219c 100644 --- a/binaries/coordinator/src/control.rs +++ b/binaries/coordinator/src/control.rs @@ -6,6 +6,7 @@ use dora_core::topics::{ControlRequest, ControlRequestReply}; use eyre::{eyre, Context}; use futures::{ future::{self, Either}, + stream::FuturesUnordered, FutureExt, Stream, StreamExt, }; use futures_concurrency::future::Race; @@ -13,20 +14,30 @@ use std::{io::ErrorKind, net::SocketAddr}; use tokio::{ net::{TcpListener, TcpStream}, sync::{mpsc, oneshot}, + task::JoinHandle, }; use tokio_stream::wrappers::ReceiverStream; pub(crate) async fn control_events( control_listen_addr: SocketAddr, + tasks: &FuturesUnordered>, ) -> eyre::Result> { let (tx, rx) = mpsc::channel(10); - tokio::spawn(listen(control_listen_addr, tx)); + let (finish_tx, mut finish_rx) = mpsc::channel(1); + tasks.push(tokio::spawn(listen(control_listen_addr, tx, 
finish_tx))); + tasks.push(tokio::spawn(async move { + while let Some(()) = finish_rx.recv().await {} + })); Ok(ReceiverStream::new(rx).map(Event::Control)) } -async fn listen(control_listen_addr: SocketAddr, tx: mpsc::Sender) { +async fn listen( + control_listen_addr: SocketAddr, + tx: mpsc::Sender, + _finish_tx: mpsc::Sender<()>, +) { let result = TcpListener::bind(control_listen_addr) .await .wrap_err("failed to listen for control messages"); @@ -51,7 +62,7 @@ async fn listen(control_listen_addr: SocketAddr, tx: mpsc::Sender) match connection.wrap_err("failed to connect") { Ok((connection, _)) => { let tx = tx.clone(); - tokio::spawn(handle_requests(connection, tx)); + tokio::spawn(handle_requests(connection, tx, _finish_tx.clone())); } Err(err) => { if tx.blocking_send(err.into()).is_err() { @@ -62,7 +73,11 @@ async fn listen(control_listen_addr: SocketAddr, tx: mpsc::Sender) } } -async fn handle_requests(mut connection: TcpStream, tx: mpsc::Sender) { +async fn handle_requests( + mut connection: TcpStream, + tx: mpsc::Sender, + _finish_tx: mpsc::Sender<()>, +) { loop { let next_request = tcp_receive(&mut connection).map(Either::Left); let coordinator_stopped = tx.closed().map(Either::Right); diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index b8a6467e..701385af 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -78,7 +78,7 @@ async fn start(runtime_path: &Path, tasks: &FuturesUnordered>) -> let mut daemon_events_tx = Some(daemon_events_tx); let daemon_events = ReceiverStream::new(daemon_events); - let control_events = control::control_events(control_socket_addr()) + let control_events = control::control_events(control_socket_addr(), tasks) .await .wrap_err("failed to create control events")?; From 9e4f01318accc05479e640538584d6a1870f1ba0 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 11:08:45 +0100 Subject: [PATCH 218/225] Fix: Move `InputBuffer` break condition to end 
of loop Fixes a deadlock issue when the `incoming` channel is closed first. The problem was that the `send_out_buf` was set to `Fuse::terminated` without breaking the loop, so on the next loop iteration both futures were already terminated. --- binaries/runtime/src/operator/channel.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/binaries/runtime/src/operator/channel.rs b/binaries/runtime/src/operator/channel.rs index 963a6c35..7780a5dc 100644 --- a/binaries/runtime/src/operator/channel.rs +++ b/binaries/runtime/src/operator/channel.rs @@ -59,10 +59,6 @@ impl InputBuffer { } Err(flume::RecvError::Disconnected) => { incoming_closed = true; - // the incoming channel was closed -> exit if we sent out all events already - if send_out.is_terminated() && self.queue.is_empty() { - break; - } } } @@ -76,6 +72,9 @@ impl InputBuffer { Err(flume::SendError(_)) => break, }, }; + if incoming_closed && send_out_buf.is_terminated() && self.queue.is_empty() { + break; + } } } From a2a6dba3b3a8fe1190dec2e8910b1bfff2c7e8a9 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 11:39:50 +0100 Subject: [PATCH 219/225] Use problem matcher and cache for CLI tests too --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 75be7a71..7bd32889 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -88,6 +88,10 @@ jobs: export DEBIAN_FRONTEND=noninteractive sudo apt-get install -y libacl1-dev + - uses: r7kamura/rust-problem-matchers@v1.1.0 + - run: cargo --version --verbose + - uses: Swatinem/rust-cache@v2 + - name: "Build cli and binaries" timeout-minutes: 30 run: | From b994ab60a761ce36c1493d3fccb19a5e589a78c8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 12:24:24 +0100 Subject: [PATCH 220/225] Add support for `shell`-based nodes again --- binaries/daemon/src/spawn.rs | 59 ++++++++++++++++++++++++------------ 1 file 
changed, 40 insertions(+), 19 deletions(-) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 224b0ecf..31f6a8cc 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -12,6 +12,8 @@ use eyre::WrapErr; use std::{env::consts::EXE_EXTENSION, path::Path, process::Stdio}; use tokio::sync::mpsc; +const SHELL_SOURCE: &str = "shell"; + pub async fn spawn_node( dataflow_id: DataflowId, working_dir: &Path, @@ -35,22 +37,43 @@ pub async fn spawn_node( let mut child = match node.kind { dora_core::descriptor::CoreNodeKind::Custom(n) => { - let resolved_path = if source_is_url(&n.source) { - // try to download the shared library - let target_path = Path::new("build") - .join(node_id.to_string()) - .with_extension(EXE_EXTENSION); - download_file(&n.source, &target_path) - .await - .wrap_err("failed to download custom node")?; - target_path.clone() - } else { - resolve_path(&n.source, working_dir) - .wrap_err_with(|| format!("failed to resolve node source `{}`", n.source))? + let mut command = match n.source.as_str() { + SHELL_SOURCE => { + if cfg!(target_os = "windows") { + let mut cmd = tokio::process::Command::new("cmd"); + cmd.args(["/C", &n.args.clone().unwrap_or_default()]); + cmd + } else { + let mut cmd = tokio::process::Command::new("sh"); + cmd.args(["-c", &n.args.clone().unwrap_or_default()]); + cmd + } + } + source => { + let resolved_path = if source_is_url(source) { + // try to download the shared library + let target_path = Path::new("build") + .join(node_id.to_string()) + .with_extension(EXE_EXTENSION); + download_file(source, &target_path) + .await + .wrap_err("failed to download custom node")?; + target_path.clone() + } else { + resolve_path(source, working_dir).wrap_err_with(|| { + format!("failed to resolve node source `{}`", source) + })? 
+ }; + + tracing::info!("spawning {}", resolved_path.display()); + let mut cmd = tokio::process::Command::new(&resolved_path); + if let Some(args) = &n.args { + cmd.args(args.split_ascii_whitespace()); + } + cmd + } }; - tracing::info!("spawning {}", resolved_path.display()); - let mut command = tokio::process::Command::new(&resolved_path); command.current_dir(working_dir); command.stdin(Stdio::null()); let node_config = NodeConfig { @@ -59,9 +82,7 @@ pub async fn spawn_node( run_config: n.run_config.clone(), daemon_communication, }; - if let Some(args) = &n.args { - command.args(args.split_ascii_whitespace()); - } + command.env( "DORA_NODE_CONFIG", serde_yaml::to_string(&node_config).wrap_err("failed to serialize node config")?, @@ -75,8 +96,8 @@ pub async fn spawn_node( } command.spawn().wrap_err_with(move || { format!( - "failed to run source path: `{}` with args `{}`", - resolved_path.display(), + "failed to run `{}` with args `{}`", + n.source, n.args.as_deref().unwrap_or_default() ) })? 
From 1213d70efb89489efacb4525e6a9187156142af7 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 17:37:48 +0100 Subject: [PATCH 221/225] Mark operator `Event` type as `non_exhaustive` --- apis/rust/operator/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/apis/rust/operator/src/lib.rs b/apis/rust/operator/src/lib.rs index e850549b..4744fd73 100644 --- a/apis/rust/operator/src/lib.rs +++ b/apis/rust/operator/src/lib.rs @@ -8,6 +8,7 @@ use types::{Metadata, Output, SendOutput}; pub mod raw; +#[non_exhaustive] pub enum Event<'a> { Input { id: &'a str, data: &'a [u8] }, Stop, From 7232a31b57c034751538992cba01b88b217ae58e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 17:45:29 +0100 Subject: [PATCH 222/225] Send `InputClosed` events to operators too --- apis/c/operator/operator_types.h | 3 +++ apis/rust/operator/src/lib.rs | 2 ++ apis/rust/operator/src/raw.rs | 2 ++ apis/rust/operator/types/src/lib.rs | 1 + binaries/runtime/src/lib.rs | 18 ++++++++++++++++++ binaries/runtime/src/operator/mod.rs | 9 +++++++++ binaries/runtime/src/operator/shared_lib.rs | 7 +++++++ 7 files changed, 42 insertions(+) diff --git a/apis/c/operator/operator_types.h b/apis/c/operator/operator_types.h index ca95fc2e..4c07b808 100644 --- a/apis/c/operator/operator_types.h +++ b/apis/c/operator/operator_types.h @@ -111,6 +111,9 @@ typedef struct RawEvent { /** */ Input_t * input; + /** */ + Vec_uint8_t input_closed; + /** */ bool stop; } RawEvent_t; diff --git a/apis/rust/operator/src/lib.rs b/apis/rust/operator/src/lib.rs index 4744fd73..9db91b97 100644 --- a/apis/rust/operator/src/lib.rs +++ b/apis/rust/operator/src/lib.rs @@ -8,9 +8,11 @@ use types::{Metadata, Output, SendOutput}; pub mod raw; +#[derive(Debug)] #[non_exhaustive] pub enum Event<'a> { Input { id: &'a str, data: &'a [u8] }, + InputClosed { id: &'a str }, Stop, } diff --git a/apis/rust/operator/src/raw.rs b/apis/rust/operator/src/raw.rs index dee87bb6..c3db60e4 100644 --- 
a/apis/rust/operator/src/raw.rs +++ b/apis/rust/operator/src/raw.rs @@ -41,6 +41,8 @@ pub unsafe fn dora_on_event( id: &input.id, data, } + } else if let Some(input_id) = &event.input_closed { + Event::InputClosed { id: input_id } } else if event.stop { Event::Stop } else { diff --git a/apis/rust/operator/types/src/lib.rs b/apis/rust/operator/types/src/lib.rs index dc8117a2..e1b39cba 100644 --- a/apis/rust/operator/types/src/lib.rs +++ b/apis/rust/operator/types/src/lib.rs @@ -58,6 +58,7 @@ pub struct OnEventFn( #[derive(Debug)] pub struct RawEvent { pub input: Option>, + pub input_closed: Option, pub stop: bool, } diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index b6c733e0..1892dd1d 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -237,6 +237,24 @@ async fn run( let operator_id = OperatorId::from(operator_id.to_owned()); let input_id = DataId::from(input_id.to_owned()); + let Some(operator_channel) = operator_channels.get(&operator_id) else { + tracing::warn!("received input {id} for unknown operator"); + continue; + }; + if let Err(err) = operator_channel + .send_async(operator::IncomingEvent::InputClosed { + input_id: input_id.clone(), + }) + .await + .wrap_err_with(|| { + format!( + "failed to send InputClosed({input_id}) to operator `{operator_id}`" + ) + }) + { + tracing::warn!("{err}"); + } + if let Some(open_inputs) = open_operator_inputs.get_mut(&operator_id) { open_inputs.remove(&input_id); if open_inputs.is_empty() { diff --git a/binaries/runtime/src/operator/mod.rs b/binaries/runtime/src/operator/mod.rs index df2548e2..2803e482 100644 --- a/binaries/runtime/src/operator/mod.rs +++ b/binaries/runtime/src/operator/mod.rs @@ -97,6 +97,9 @@ pub enum IncomingEvent { metadata: Metadata<'static>, data: Option>, }, + InputClosed { + input_id: DataId, + }, } impl IntoPy for IncomingEvent { @@ -124,6 +127,12 @@ impl IntoPy for IncomingEvent { .unwrap(); "INPUT" } + Self::InputClosed { input_id } => { + 
dict.set_item("id", input_id.to_string()) + .wrap_err("failed to add input ID") + .unwrap(); + "INPUT_CLOSED" + } }; dict.set_item("type", ty) diff --git a/binaries/runtime/src/operator/shared_lib.rs b/binaries/runtime/src/operator/shared_lib.rs index 9e6e5666..aaeb8da6 100644 --- a/binaries/runtime/src/operator/shared_lib.rs +++ b/binaries/runtime/src/operator/shared_lib.rs @@ -170,6 +170,7 @@ impl<'lib> SharedLibraryOperator<'lib> { let operator_event = match event { IncomingEvent::Stop => dora_operator_api_types::RawEvent { input: None, + input_closed: None, stop: true, }, IncomingEvent::Input { @@ -190,9 +191,15 @@ impl<'lib> SharedLibraryOperator<'lib> { }; dora_operator_api_types::RawEvent { input: Some(Box::new(operator_input).into()), + input_closed: None, stop: false, } } + IncomingEvent::InputClosed { input_id } => dora_operator_api_types::RawEvent { + input_closed: Some(input_id.to_string().into()), + input: None, + stop: false, + }, }; let send_output = SendOutput { From bb8d6edb40fe6f09155fca4694ba7ec75e609514 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 17:49:17 +0100 Subject: [PATCH 223/225] Use more reliable `InputClosed` event instead of timer as exit condition We might not receive any `random` input if the startup of the operator is delayed and the source node is already finished. 
--- examples/rust-dataflow/operator/src/lib.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/rust-dataflow/operator/src/lib.rs b/examples/rust-dataflow/operator/src/lib.rs index 681213c5..81a1a1f0 100644 --- a/examples/rust-dataflow/operator/src/lib.rs +++ b/examples/rust-dataflow/operator/src/lib.rs @@ -1,14 +1,12 @@ #![warn(unsafe_op_in_unsafe_fn)] use dora_operator_api::{register_operator, DoraOperator, DoraOutputSender, DoraStatus, Event}; -use std::time::{Duration, Instant}; register_operator!(ExampleOperator); #[derive(Debug, Default)] struct ExampleOperator { ticks: usize, - last_random_at: Option, } impl DoraOperator for ExampleOperator { @@ -33,19 +31,22 @@ impl DoraOperator for ExampleOperator { self.ticks ); output_sender.send("status".into(), output.into_bytes())?; - self.last_random_at = Some(Instant::now()); } other => eprintln!("ignoring unexpected input {other}"), }, Event::Stop => {} - } - - if let Some(last_random_at) = self.last_random_at { - if last_random_at.elapsed() > Duration::from_secs(1) { - // looks like the node sending the random values finished -> exit too - return Ok(DoraStatus::Stop); + Event::InputClosed { id } => { + println!("input `{id}` was closed"); + if *id == "random" { + println!("`random` input was closed -> exiting"); + return Ok(DoraStatus::Stop); + } + } + other => { + println!("received unknown event {other:?}"); } } + Ok(DoraStatus::Continue) } } From d5ceb4428abb4a33a699874401ed2e5c92890016 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 17:52:39 +0100 Subject: [PATCH 224/225] Fix: Update `open_inputs` even if there is no subscriber for closed input --- binaries/daemon/src/lib.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 9a6f66bf..4db816cd 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -754,16 +754,14 @@ where 
.flat_map(|(_, v)| v) .collect(); for (receiver_id, input_id) in downstream_nodes { - let Some(channel) = dataflow.subscribe_channels.get(receiver_id) else { - continue; + if let Some(channel) = dataflow.subscribe_channels.get(receiver_id) { + let _ = channel + .send_async(daemon_messages::NodeEvent::InputClosed { + id: input_id.clone(), + }) + .await; }; - let _ = channel - .send_async(daemon_messages::NodeEvent::InputClosed { - id: input_id.clone(), - }) - .await; - if let Some(open_inputs) = dataflow.open_inputs.get_mut(receiver_id) { open_inputs.remove(input_id); if open_inputs.is_empty() { From 9439b08f162e9c6e3f68d4e858847207c3bb1a6c Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 6 Mar 2023 17:55:19 +0100 Subject: [PATCH 225/225] Fix: Keep track of `InputClosed` and `Stop` messages and send them on subscribe These two messages can be essential for correctness. For example, a node might not finish properly when an `InputClosed` event is lost. So we need to always send them, even if the target node was not subscribed yet when the event occurred. 
--- binaries/daemon/src/lib.rs | 87 +++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 16 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 4db816cd..81bc9699 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -252,9 +252,7 @@ impl Daemon { } Event::CtrlC => { for dataflow in self.running.values_mut() { - for (_node_id, channel) in dataflow.subscribe_channels.drain() { - let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; - } + dataflow.stop_all().await; } } } @@ -295,10 +293,7 @@ impl Daemon { .running .get_mut(&dataflow_id) .wrap_err_with(|| format!("no running dataflow with ID `{dataflow_id}`"))?; - - for (_node_id, channel) in dataflow.subscribe_channels.drain() { - let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; - } + dataflow.stop_all().await; Result::<(), eyre::Report>::Ok(()) }; let reply = DaemonCoordinatorReply::StopResult( @@ -408,15 +403,7 @@ impl Daemon { ) -> eyre::Result<()> { match event { DaemonNodeEvent::Subscribe { event_sender } => { - let result = match self.running.get_mut(&dataflow_id) { - Some(dataflow) => { - dataflow.subscribe_channels.insert(node_id, event_sender); - Ok(()) - } - None => Err(format!( - "subscribe failed: no running dataflow with ID `{dataflow_id}`" - )), - }; + let result = self.subscribe(dataflow_id, node_id, event_sender).await; let _ = reply_sender.send(DaemonReply::Result(result)); } DaemonNodeEvent::CloseOutputs(outputs) => { @@ -448,6 +435,55 @@ impl Daemon { Ok(()) } + async fn subscribe( + &mut self, + dataflow_id: Uuid, + node_id: NodeId, + event_sender: flume::Sender, + ) -> Result<(), String> { + let dataflow = self.running.get_mut(&dataflow_id).ok_or_else(|| { + format!("subscribe failed: no running dataflow with ID `{dataflow_id}`") + })?; + + // some inputs might have been closed already -> report those events + let closed_inputs = dataflow + .mappings + .values() + .flatten() + 
.filter(|(node, _)| node == &node_id) + .map(|(_, input)| input) + .filter(|input| { + dataflow + .open_inputs + .get(&node_id) + .map(|open_inputs| !open_inputs.contains(*input)) + .unwrap_or(true) + }); + for input_id in closed_inputs { + let _ = event_sender + .send_async(daemon_messages::NodeEvent::InputClosed { + id: input_id.clone(), + }) + .await; + } + + // if a stop event was already sent for the dataflow, send it to + // the newly connected node too + if dataflow.stop_sent { + let _ = event_sender + .send_async(daemon_messages::NodeEvent::Stop) + .await; + } + + if dataflow.stop_sent || dataflow.open_inputs(&node_id).is_empty() { + tracing::debug!("Received subscribe message for closed event stream"); + } else { + dataflow.subscribe_channels.insert(node_id, event_sender); + } + + Ok(()) + } + #[tracing::instrument(skip(self))] async fn handle_node_stop( &mut self, @@ -781,6 +817,25 @@ pub struct RunningDataflow { running_nodes: BTreeSet, /// Keep handles to all timer tasks of this dataflow to cancel them on drop. _timer_handles: Vec>, + stop_sent: bool, + + /// Used in `open_inputs`. + /// + /// TODO: replace this with a constant once `BTreeSet::new` is `const` on stable. + empty_set: BTreeSet, +} + +impl RunningDataflow { + async fn stop_all(&mut self) { + for (_node_id, channel) in self.subscribe_channels.drain() { + let _ = channel.send_async(daemon_messages::NodeEvent::Stop).await; + } + self.stop_sent = true; + } + + fn open_inputs(&self, node_id: &NodeId) -> &BTreeSet { + self.open_inputs.get(node_id).unwrap_or(&self.empty_set) + } } type OutputId = (NodeId, DataId);