
temp fix for avoiding shared mem crash

pull/1096/head
rozgo 5 months ago
commit ea446f910b
1 changed file with 53 additions and 4 deletions

apis/rust/node/src/node/mod.rs (+53, -4)

@@ -75,7 +75,7 @@ pub struct DoraNode {
     control_channel: ControlChannel,
     clock: Arc<uhlc::HLC>,

-    sent_out_shared_memory: HashMap<DropToken, ShmemHandle>,
+    sent_out_shared_memory: HashMap<DropToken, (ShmemHandle, uhlc::Timestamp)>,
     drop_stream: DropStream,
     cache: VecDeque<ShmemHandle>,

@@ -403,8 +403,9 @@ impl DoraNode {
             .wrap_err_with(|| format!("failed to send output {output_id}"))?;

         if let Some((shared_memory, drop_token)) = shmem {
+            let timestamp = self.clock.new_timestamp();
             self.sent_out_shared_memory
-                .insert(drop_token, shared_memory);
+                .insert(drop_token, (shared_memory, timestamp));
         }

         Ok(())
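
Each sent-out segment is now tagged with the HLC reading taken at send time; this is what the forced cleanup further down sorts on. A minimal standalone sketch of that tagging, assuming the `uhlc` crate (which the struct above already uses via `Arc<uhlc::HLC>`), with `u64` and `String` as hypothetical stand-ins for `DropToken` and `ShmemHandle`:

use std::collections::HashMap;
use std::sync::Arc;

fn main() {
    // HLC::default() creates a hybrid logical clock; new_timestamp() readings
    // are unique and monotonically increasing, so they are totally ordered.
    let clock = Arc::new(uhlc::HLC::default());

    // u64 and String stand in for DropToken and ShmemHandle (illustration only).
    let mut sent_out: HashMap<u64, (String, uhlc::Timestamp)> = HashMap::new();

    // Tag the segment with the clock reading at send time, mirroring the
    // insert in the hunk above.
    let timestamp = clock.new_timestamp();
    sent_out.insert(1, ("segment-1".to_string(), timestamp));

    // Later readings compare greater, which is what lets the forced cleanup
    // identify the oldest entries.
    assert!(sent_out[&1].1 < clock.new_timestamp());
}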
@@ -469,6 +470,11 @@ impl DoraNode {
     }

     fn allocate_shared_memory(&mut self, data_len: usize) -> eyre::Result<ShmemHandle> {
+        // TODO: TEMPORARY - This is a workaround for memory pressure issues
+        // Need deeper investigation into why drop tokens aren't being processed in time
+        // First, try to process any pending drop tokens to free up memory
+        let _ = self.handle_finished_drop_tokens();
+
         let cache_index = self
             .cache
             .iter()
@@ -496,10 +502,14 @@ impl DoraNode {
     }

     fn handle_finished_drop_tokens(&mut self) -> eyre::Result<()> {
+        // TODO: TEMPORARY - More aggressive cleanup when we have many pending segments
+        // This is a band-aid solution - need to investigate root cause of memory retention
+        const MAX_PENDING_SEGMENTS: usize = 100; // Increased limit for high-frequency data
+
         loop {
             match self.drop_stream.try_recv() {
                 Ok(token) => match self.sent_out_shared_memory.remove(&token) {
-                    Some(region) => self.add_to_cache(region),
+                    Some((region, _timestamp)) => self.add_to_cache(region),
                     None => tracing::warn!("received unknown finished drop token `{token:?}`"),
                 },
                 Err(flume::TryRecvError::Empty) => break,
@@ -508,6 +518,43 @@ impl DoraNode {
                 }
             }
         }

+        // TODO: TEMPORARY - If we have too many pending segments, force cleanup of the oldest ones
+        // WARNING: This forceful cleanup might drop segments still in use by receivers
+        // Need to implement proper reference counting or lifecycle management
+        if self.sent_out_shared_memory.len() > MAX_PENDING_SEGMENTS {
+            tracing::warn!(
+                "Too many pending shared memory segments ({}), forcing cleanup of oldest segments",
+                self.sent_out_shared_memory.len()
+            );
+
+            // TODO: TEMPORARY FIX - Remove oldest entries beyond the limit
+            // This properly removes the oldest segments based on timestamp, but the root cause
+            // of why drop tokens aren't being received in time still needs investigation
+            let to_remove = self.sent_out_shared_memory.len() - MAX_PENDING_SEGMENTS;
+
+            // Collect all entries with their timestamps and sort by age
+            let mut entries: Vec<_> = self
+                .sent_out_shared_memory
+                .iter()
+                .map(|(token, (_, timestamp))| (*token, *timestamp))
+                .collect();
+            entries.sort_by_key(|(_, timestamp)| *timestamp);
+
+            // Remove the oldest entries
+            let keys_to_remove: Vec<_> = entries
+                .into_iter()
+                .take(to_remove)
+                .map(|(token, _)| token)
+                .collect();
+
+            for key in keys_to_remove {
+                if let Some((region, _)) = self.sent_out_shared_memory.remove(&key) {
+                    self.add_to_cache(region);
+                }
+            }
+        }
+
         Ok(())
     }

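The forced eviction above is the heart of the temporary fix: when more than `MAX_PENDING_SEGMENTS` segments are still awaiting drop tokens, the oldest ones (by send-time timestamp) are recycled into the cache regardless. A self-contained sketch of the same oldest-first strategy, using `u64` timestamps and `String` regions as hypothetical stand-ins for `uhlc::Timestamp` and `ShmemHandle`:

use std::collections::HashMap;

const MAX_PENDING_SEGMENTS: usize = 100;

fn evict_oldest(pending: &mut HashMap<u64, (String, u64)>) -> Vec<String> {
    let mut evicted = Vec::new();
    if pending.len() > MAX_PENDING_SEGMENTS {
        let to_remove = pending.len() - MAX_PENDING_SEGMENTS;

        // Collect (token, timestamp) pairs and sort so the oldest come first.
        let mut entries: Vec<_> = pending
            .iter()
            .map(|(token, (_, timestamp))| (*token, *timestamp))
            .collect();
        entries.sort_by_key(|(_, timestamp)| *timestamp);

        for (token, _) in entries.into_iter().take(to_remove) {
            if let Some((region, _)) = pending.remove(&token) {
                evicted.push(region); // the node would call add_to_cache(region) here
            }
        }
    }
    evicted
}

fn main() {
    let mut pending = HashMap::new();
    for i in 0..110u64 {
        // The timestamp simply mirrors insertion order in this sketch.
        pending.insert(i, (format!("segment-{i}"), i));
    }
    let evicted = evict_oldest(&mut pending);
    assert_eq!(evicted.len(), 10); // the ten oldest segments were reclaimed
    assert_eq!(pending.len(), MAX_PENDING_SEGMENTS);
}

As the WARNING comment in the diff notes, this trades safety for bounded memory: an evicted segment may still be mapped by a receiver, which is why the commit message calls it a temporary fix.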
@@ -561,7 +608,9 @@ impl Drop for DoraNode {

         match self.drop_stream.recv_timeout(Duration::from_secs(2)) {
             Ok(token) => {
-                self.sent_out_shared_memory.remove(&token);
+                if let Some((region, _)) = self.sent_out_shared_memory.remove(&token) {
+                    self.add_to_cache(region);
+                }
             }
             Err(flume::RecvTimeoutError::Disconnected) => {
                 tracing::warn!(

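The `Drop` hunk waits up to two seconds per outstanding token so that returned regions can still be reclaimed before the node exits. A small sketch of that wait pattern, assuming flume's `recv_timeout` (the diff already matches on `flume::RecvTimeoutError`) on a plain `u64` channel as a stand-in for `DropStream`:

use std::time::Duration;

fn main() {
    let (tx, rx) = flume::unbounded::<u64>();
    tx.send(42).unwrap();
    drop(tx); // no more senders: the channel reports Disconnected once drained

    // The first call yields the pending token; the second sees the closed channel.
    for _ in 0..2 {
        match rx.recv_timeout(Duration::from_secs(2)) {
            Ok(token) => println!("received drop token {token}"),
            Err(flume::RecvTimeoutError::Timeout) => println!("timed out waiting for drop token"),
            Err(flume::RecvTimeoutError::Disconnected) => println!("drop channel closed"),
        }
    }
}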
