| /* |
| * mc.h: vlib reliable sequenced multicast distributed applications |
| * |
| * Copyright (c) 2010 Cisco and/or its affiliates. |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef included_vlib_mc_h |
| #define included_vlib_mc_h |
| |
| #include <vppinfra/elog.h> |
| #include <vppinfra/fifo.h> |
| #include <vppinfra/mhash.h> |
| #include <vlib/node.h> |
| |
| #ifndef MC_EVENT_LOGGING |
| #define MC_EVENT_LOGGING 1 |
| #endif |
| |
| always_inline uword |
| mc_need_byte_swap (void) |
| { return CLIB_ARCH_IS_LITTLE_ENDIAN; } |
| |
| /* |
| * Used to uniquely identify hosts. |
| * For IP4 this would be ip4_address plus tcp/udp port. |
| */ |
| typedef union { |
| u8 as_u8[8]; |
| u64 as_u64; |
| } mc_peer_id_t; |
| |
| always_inline mc_peer_id_t |
| mc_byte_swap_peer_id (mc_peer_id_t i) |
| { |
| /* Peer id is already in network byte order. */ |
| return i; |
| } |
| |
| always_inline int |
| mc_peer_id_compare (mc_peer_id_t a, mc_peer_id_t b) |
| { |
| return memcmp (a.as_u8, b.as_u8, sizeof (a.as_u8)); |
| } |
| |
| /* Assert mastership. Lowest peer_id amount all peers wins mastership. |
| Only sent/received over mastership channel (MC_TRANSPORT_MASTERSHIP). |
| So, we don't need a message opcode. */ |
| typedef CLIB_PACKED (struct { |
| /* Peer id asserting mastership. */ |
| mc_peer_id_t peer_id; |
| |
| /* Global sequence number asserted. */ |
| u32 global_sequence; |
| }) mc_msg_master_assert_t; |
| |
| always_inline void |
| mc_byte_swap_msg_master_assert (mc_msg_master_assert_t * r) |
| { |
| if (mc_need_byte_swap ()) |
| { |
| r->peer_id = mc_byte_swap_peer_id (r->peer_id); |
| r->global_sequence = clib_byte_swap_u32 (r->global_sequence); |
| } |
| } |
| |
| #define foreach_mc_msg_type \ |
| _ (master_assert) \ |
| _ (join_or_leave_request) \ |
| _ (join_reply) \ |
| _ (user_request) \ |
| _ (user_ack) \ |
| _ (catchup_request) \ |
| _ (catchup_reply) |
| |
| typedef enum { |
| #define _(f) MC_MSG_TYPE_##f, |
| foreach_mc_msg_type |
| #undef _ |
| } mc_relay_msg_type_t; |
| |
| /* Request to join a given stream. Multicast over MC_TRANSPORT_JOIN. */ |
| typedef CLIB_PACKED (struct { |
| mc_peer_id_t peer_id; |
| |
| mc_relay_msg_type_t type : 32; /* MC_MSG_TYPE_join_or_leave_request */ |
| |
| /* Stream to join or leave. */ |
| u32 stream_index; |
| |
| /* join = 1, leave = 0 */ |
| u8 is_join; |
| }) mc_msg_join_or_leave_request_t; |
| |
| always_inline void |
| mc_byte_swap_msg_join_or_leave_request (mc_msg_join_or_leave_request_t * r) |
| { |
| if (mc_need_byte_swap ()) |
| { |
| r->peer_id = mc_byte_swap_peer_id (r->peer_id); |
| r->type = clib_byte_swap_u32 (r->type); |
| r->stream_index = clib_byte_swap_u32 (r->stream_index); |
| } |
| } |
| |
| /* Join reply. Multicast over MC_TRANSPORT_JOIN. */ |
| typedef CLIB_PACKED (struct { |
| mc_peer_id_t peer_id; |
| |
| mc_relay_msg_type_t type : 32; /* MC_MSG_TYPE_join_reply */ |
| |
| u32 stream_index; |
| |
| /* Peer ID to contact to catchup with this stream. */ |
| mc_peer_id_t catchup_peer_id; |
| }) mc_msg_join_reply_t; |
| |
| always_inline void |
| mc_byte_swap_msg_join_reply (mc_msg_join_reply_t * r) |
| { |
| if (mc_need_byte_swap ()) |
| { |
| r->peer_id = mc_byte_swap_peer_id (r->peer_id); |
| r->type = clib_byte_swap_u32 (r->type); |
| r->stream_index = clib_byte_swap_u32 (r->stream_index); |
| r->catchup_peer_id = mc_byte_swap_peer_id (r->catchup_peer_id); |
| } |
| } |
| |
| /* Generic (application) request. Multicast over MC_TRANSPORT_USER_REQUEST_TO_RELAY and then |
| relayed by relay master after filling in global sequence number. */ |
| typedef CLIB_PACKED (struct { |
| mc_peer_id_t peer_id; |
| |
| u32 stream_index; |
| |
| /* Global sequence number as filled in by relay master. */ |
| u32 global_sequence; |
| |
| /* Local sequence number as filled in by peer sending message. */ |
| u32 local_sequence; |
| |
| /* Size of request data. */ |
| u32 n_data_bytes; |
| |
| /* Opaque request data. */ |
| u8 data[0]; |
| }) mc_msg_user_request_t; |
| |
| always_inline void |
| mc_byte_swap_msg_user_request (mc_msg_user_request_t * r) |
| { |
| if (mc_need_byte_swap ()) |
| { |
| r->peer_id = mc_byte_swap_peer_id (r->peer_id); |
| r->stream_index = clib_byte_swap_u32 (r->stream_index); |
| r->global_sequence = clib_byte_swap_u32 (r->global_sequence); |
| r->local_sequence = clib_byte_swap_u32 (r->local_sequence); |
| r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes); |
| } |
| } |
| |
| /* Sent unicast over ACK channel. */ |
| typedef CLIB_PACKED (struct { |
| mc_peer_id_t peer_id; |
| u32 global_sequence; |
| u32 stream_index; |
| u32 local_sequence; |
| i32 seq_cmp_result; |
| }) mc_msg_user_ack_t; |
| |
| always_inline void |
| mc_byte_swap_msg_user_ack (mc_msg_user_ack_t * r) |
| { |
| if (mc_need_byte_swap ()) |
| { |
| r->peer_id = mc_byte_swap_peer_id (r->peer_id); |
| r->stream_index = clib_byte_swap_u32 (r->stream_index); |
| r->global_sequence = clib_byte_swap_u32 (r->global_sequence); |
| r->local_sequence = clib_byte_swap_u32 (r->local_sequence); |
| r->seq_cmp_result = clib_byte_swap_i32 (r->seq_cmp_result); |
| } |
| } |
| |
| /* Sent/received unicast over catchup channel (e.g. using TCP). */ |
| typedef CLIB_PACKED (struct { |
| mc_peer_id_t peer_id; |
| u32 stream_index; |
| }) mc_msg_catchup_request_t; |
| |
| always_inline void |
| mc_byte_swap_msg_catchup_request (mc_msg_catchup_request_t * r) |
| { |
| if (mc_need_byte_swap ()) |
| { |
| r->peer_id = mc_byte_swap_peer_id (r->peer_id); |
| r->stream_index = clib_byte_swap_u32 (r->stream_index); |
| } |
| } |
| |
| /* Sent/received unicast over catchup channel. */ |
| typedef CLIB_PACKED (struct { |
| mc_peer_id_t peer_id; |
| |
| u32 stream_index; |
| |
| /* Last global sequence number included in catchup data. */ |
| u32 last_global_sequence_included; |
| |
| /* Size of catchup data. */ |
| u32 n_data_bytes; |
| |
| /* Catchup data. */ |
| u8 data[0]; |
| }) mc_msg_catchup_reply_t; |
| |
| always_inline void |
| mc_byte_swap_msg_catchup_reply (mc_msg_catchup_reply_t * r) |
| { |
| if (mc_need_byte_swap ()) |
| { |
| r->peer_id = mc_byte_swap_peer_id (r->peer_id); |
| r->stream_index = clib_byte_swap_u32 (r->stream_index); |
| r->last_global_sequence_included = clib_byte_swap_u32 (r->last_global_sequence_included); |
| r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes); |
| } |
| } |
| |
| typedef struct _mc_serialize_msg { |
| /* Name for this type. */ |
| char * name; |
| |
| /* Functions to serialize/unserialize data. */ |
| serialize_function_t * serialize; |
| serialize_function_t * unserialize; |
| |
| /* Maximum message size in bytes when serialized. |
| If zero then this will be set to the largest sent message. */ |
| u32 max_n_bytes_serialized; |
| |
| /* Opaque to use for first argument to serialize/unserialize function. */ |
| u32 opaque; |
| |
| /* Index in global message vector. */ |
| u32 global_index; |
| |
| /* Registration list */ |
| struct _mc_serialize_msg * next_registration; |
| } mc_serialize_msg_t; |
| |
| typedef struct { |
| /* Index into global message vector. */ |
| u32 global_index; |
| } mc_serialize_stream_msg_t; |
| |
| #define MC_SERIALIZE_MSG(x,...) \ |
| __VA_ARGS__ mc_serialize_msg_t x; \ |
| static void __mc_serialize_msg_registration_##x (void) \ |
| __attribute__((__constructor__)) ; \ |
| static void __mc_serialize_msg_registration_##x (void) \ |
| { \ |
| vlib_main_t * vm = vlib_get_main(); \ |
| x.next_registration = vm->mc_msg_registrations; \ |
| vm->mc_msg_registrations = &x; \ |
| } \ |
| __VA_ARGS__ mc_serialize_msg_t x |
| |
| typedef enum { |
| MC_TRANSPORT_MASTERSHIP, |
| MC_TRANSPORT_JOIN, |
| MC_TRANSPORT_USER_REQUEST_TO_RELAY, |
| MC_TRANSPORT_USER_REQUEST_FROM_RELAY, |
| MC_N_TRANSPORT_TYPE, |
| } mc_transport_type_t; |
| |
| typedef struct { |
| clib_error_t * (* tx_buffer) (void * opaque, mc_transport_type_t type, u32 buffer_index); |
| |
| clib_error_t * (* tx_ack) (void * opaque, mc_peer_id_t peer_id, u32 buffer_index); |
| |
| /* Returns catchup opaque. */ |
| uword (* catchup_request_fun) (void * opaque, u32 stream_index, mc_peer_id_t catchup_peer_id); |
| |
| void (* catchup_send_fun) (void * opaque, uword catchup_opaque, u8 * data_vector); |
| |
| /* Opaque passed to callbacks. */ |
| void * opaque; |
| |
| mc_peer_id_t our_ack_peer_id; |
| mc_peer_id_t our_catchup_peer_id; |
| |
| /* Max packet size (MTU) for this transport. |
| For IP this is interface MTU less IP + UDP header size. */ |
| u32 max_packet_size; |
| |
| format_function_t * format_peer_id; |
| } mc_transport_t; |
| |
| typedef struct { |
| /* Count of messages received from this peer from the past/future |
| (with seq_cmp != 0). */ |
| u64 n_msgs_from_past; |
| u64 n_msgs_from_future; |
| } mc_stream_peer_stats_t; |
| |
| typedef struct { |
| /* ID of this peer. */ |
| mc_peer_id_t id; |
| |
| /* The last sequence we received from this peer. */ |
| u32 last_sequence_received; |
| |
| mc_stream_peer_stats_t stats, stats_last_clear; |
| } mc_stream_peer_t; |
| |
| typedef struct { |
| u32 buffer_index; |
| |
| /* Cached copy of local sequence number from buffer. */ |
| u32 local_sequence; |
| |
| /* Number of times this buffer has been sent (retried). */ |
| u32 n_retries; |
| |
| /* Previous/next retries in doubly-linked list. */ |
| u32 prev_index, next_index; |
| |
| /* Bitmap of all peers which have acked this msg */ |
| uword * unacked_by_peer_bitmap; |
| |
| /* Message send or resend time */ |
| f64 sent_at; |
| } mc_retry_t; |
| |
| typedef struct { |
| /* Number of retries sent for this stream. */ |
| u64 n_retries; |
| } mc_stream_stats_t; |
| |
| struct mc_main_t; |
| struct mc_stream_t; |
| |
| typedef struct { |
| /* Stream name. */ |
| char * name; |
| |
| /* Number of outstanding messages. */ |
| u32 window_size; |
| |
| /* Retry interval, in seconds */ |
| f64 retry_interval; |
| |
| /* Retry limit */ |
| u32 retry_limit; |
| |
| /* User rx buffer callback */ |
| void (* rx_buffer) (struct mc_main_t * mc_main, |
| struct mc_stream_t * stream, |
| mc_peer_id_t peer_id, |
| u32 buffer_index); |
| |
| /* User callback to create a snapshot */ |
| u8 * (* catchup_snapshot) (struct mc_main_t *mc_main, |
| u8 * snapshot_vector, |
| u32 last_global_sequence_included); |
| |
| /* User callback to replay a snapshot */ |
| void (* catchup) (struct mc_main_t *mc_main, |
| u8 * snapshot_data, |
| u32 n_snapshot_data_bytes); |
| |
| /* Callback to save a snapshot for offline replay */ |
| void (* save_snapshot) (struct mc_main_t *mc_main, |
| u32 is_catchup, |
| u8 * snapshot_data, |
| u32 n_snapshot_data_bytes); |
| |
| /* Called when a peer dies */ |
| void (* peer_died) (struct mc_main_t * mc_main, |
| struct mc_stream_t * stream, |
| mc_peer_id_t peer_id); |
| } mc_stream_config_t; |
| |
| #define foreach_mc_stream_state \ |
| _ (invalid) \ |
| _ (name_known) \ |
| _ (join_in_progress) \ |
| _ (catchup) \ |
| _ (ready) |
| |
| typedef enum { |
| #define _(f) MC_STREAM_STATE_##f, |
| foreach_mc_stream_state |
| #undef _ |
| } mc_stream_state_t; |
| |
| typedef struct mc_stream_t { |
| mc_stream_config_t config; |
| |
| mc_stream_state_t state; |
| |
| /* Index in stream pool. */ |
| u32 index; |
| |
| /* Stream index 0 is always for MC internal use. */ |
| #define MC_STREAM_INDEX_INTERNAL 0 |
| |
| mc_retry_t * retry_pool; |
| |
| /* Head and tail index of retry pool. */ |
| u32 retry_head_index, retry_tail_index; |
| |
| /* |
| * Country club for recently retired messages |
| * If the set of peers is expanding and a new peer |
| * misses a message, we can easily retire the FIFO |
| * element before we even know about the new peer |
| */ |
| mc_retry_t * retired_fifo; |
| |
| /* Hash mapping local sequence to retry pool index. */ |
| uword * retry_index_by_local_sequence; |
| |
| /* catch-up fifo of VLIB buffer indices. |
| start recording when catching up. */ |
| u32 * catchup_fifo; |
| |
| mc_stream_stats_t stats, stats_last_clear; |
| |
| /* Peer pool. */ |
| mc_stream_peer_t * peers; |
| |
| /* Bitmap with ones for all peers in peer pool. */ |
| uword * all_peer_bitmap; |
| |
| /* Map of 64 bit id to index in stream pool. */ |
| mhash_t peer_index_by_id; |
| |
| /* Timeout, in case we're alone in the world */ |
| f64 join_timeout; |
| |
| vlib_one_time_waiting_process_t * procs_waiting_for_join_done; |
| |
| vlib_one_time_waiting_process_t * procs_waiting_for_open_window; |
| |
| /* Next sequence number to use */ |
| u32 our_local_sequence; |
| |
| /* |
| * Last global sequence we processed. |
| * When supplying catchup data, we need to tell |
| * the client precisely where to start replaying |
| */ |
| u32 last_global_sequence_processed; |
| |
| /* Vector of unique messages we've sent on this stream. */ |
| mc_serialize_stream_msg_t * stream_msgs; |
| |
| /* Vector global message index into per stream message index. */ |
| u32 * stream_msg_index_by_global_index; |
| |
| /* Hashed by message name. */ |
| uword * stream_msg_index_by_name; |
| |
| u64 user_requests_sent; |
| u64 user_requests_received; |
| } mc_stream_t; |
| |
| always_inline void |
| mc_stream_free (mc_stream_t * s) |
| { |
| pool_free (s->retry_pool); |
| hash_free (s->retry_index_by_local_sequence); |
| clib_fifo_free (s->catchup_fifo); |
| pool_free (s->peers); |
| mhash_free (&s->peer_index_by_id); |
| vec_free (s->procs_waiting_for_join_done); |
| vec_free (s->procs_waiting_for_open_window); |
| } |
| |
| always_inline void |
| mc_stream_init (mc_stream_t * s) |
| { |
| memset (s, 0, sizeof (s[0])); |
| s->retry_head_index = s->retry_tail_index = ~0; |
| } |
| |
| typedef struct { |
| u32 stream_index; |
| u32 catchup_opaque; |
| u8 *catchup_snapshot; |
| } mc_catchup_process_arg_t; |
| |
| typedef enum { |
| MC_RELAY_STATE_NEGOTIATE, |
| MC_RELAY_STATE_MASTER, |
| MC_RELAY_STATE_SLAVE, |
| } mc_relay_state_t; |
| |
| typedef struct { |
| mc_peer_id_t peer_id; |
| |
| f64 time_last_master_assert_received; |
| } mc_mastership_peer_t; |
| |
| typedef struct { |
| u32 stream_index; |
| u32 buffer_index; |
| } mc_stream_and_buffer_t; |
| |
| typedef struct mc_main_t { |
| mc_relay_state_t relay_state; |
| |
| /* Mastership */ |
| u32 we_can_be_relay_master; |
| |
| u64 relay_master_peer_id; |
| |
| mc_mastership_peer_t * mastership_peers; |
| |
| /* Map of 64 bit id to index in stream pool. */ |
| mhash_t mastership_peer_index_by_id; |
| |
| /* The transport we're using. */ |
| mc_transport_t transport; |
| |
| /* Last-used global sequence number. */ |
| u32 relay_global_sequence; |
| |
| /* Vector of streams. */ |
| mc_stream_t * stream_vector; |
| |
| /* Hash table mapping stream name to pool index. */ |
| uword * stream_index_by_name; |
| |
| uword * procs_waiting_for_stream_name_by_name; |
| |
| vlib_one_time_waiting_process_t ** procs_waiting_for_stream_name_pool; |
| |
| int joins_in_progress; |
| |
| mc_catchup_process_arg_t * catchup_process_args; |
| |
| /* Node indices for mastership, join ager, |
| retry and catchup processes. */ |
| u32 mastership_process; |
| u32 join_ager_process; |
| u32 retry_process; |
| u32 catchup_process; |
| u32 unserialize_process; |
| |
| /* Global vector of messages. */ |
| mc_serialize_msg_t ** global_msgs; |
| |
| /* Hash table mapping message name to index. */ |
| uword * global_msg_index_by_name; |
| |
| /* Shared serialize/unserialize main. */ |
| serialize_main_t serialize_mains[VLIB_N_RX_TX]; |
| |
| vlib_serialize_buffer_main_t serialize_buffer_mains[VLIB_N_RX_TX]; |
| |
| /* Convenience variables */ |
| struct vlib_main_t * vlib_main; |
| elog_main_t * elog_main; |
| |
| /* Maps 64 bit peer id to elog string table offset for this formatted peer id. */ |
| mhash_t elog_id_by_peer_id; |
| |
| uword *elog_id_by_msg_name; |
| |
| /* For mc_unserialize. */ |
| mc_stream_and_buffer_t * mc_unserialize_stream_and_buffers; |
| } mc_main_t; |
| |
| always_inline mc_stream_t * |
| mc_stream_by_name (mc_main_t * m, char * name) |
| { |
| uword * p = hash_get (m->stream_index_by_name, name); |
| return p ? vec_elt_at_index (m->stream_vector, p[0]) : 0; |
| } |
| |
| always_inline mc_stream_t * |
| mc_stream_by_index (mc_main_t * m, u32 i) |
| { |
| return i < vec_len (m->stream_vector) ? m->stream_vector + i : 0; |
| } |
| |
| always_inline void |
| mc_clear_stream_stats (mc_main_t * m) |
| { |
| mc_stream_t * s; |
| mc_stream_peer_t * p; |
| vec_foreach (s, m->stream_vector) |
| { |
| s->stats_last_clear = s->stats; |
| pool_foreach (p, s->peers, ({ |
| p->stats_last_clear = p->stats; |
| })); |
| } |
| } |
| |
| /* Declare all message handlers. */ |
| #define _(f) void mc_msg_##f##_handler (mc_main_t * mcm, mc_msg_##f##_t * msg, u32 buffer_index); |
| foreach_mc_msg_type |
| #undef _ |
| |
| u32 mc_stream_join (mc_main_t * mcm, mc_stream_config_t *); |
| |
| void mc_stream_leave (mc_main_t * mcm, u32 stream_index); |
| |
| void mc_wait_for_stream_ready (mc_main_t * m, char * stream_name); |
| |
| u32 mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index); |
| |
| void mc_main_init (mc_main_t * mcm, char * tag); |
| |
| void mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master); |
| |
| void * mc_get_vlib_buffer (struct vlib_main_t * vm, u32 n_bytes, u32 * bi_return); |
| |
| format_function_t format_mc_main; |
| |
| clib_error_t * |
| mc_serialize_internal (mc_main_t * mc, |
| u32 stream_index, |
| u32 multiple_messages_per_vlib_buffer, |
| mc_serialize_msg_t * msg, |
| ...); |
| |
| clib_error_t * |
| mc_serialize_va (mc_main_t * mc, |
| u32 stream_index, |
| u32 multiple_messages_per_vlib_buffer, |
| mc_serialize_msg_t * msg, |
| va_list * va); |
| |
| #define mc_serialize_stream(mc,si,msg,args...) \ |
| mc_serialize_internal((mc),(si),(0),(msg),(msg)->serialize,args) |
| |
| #define mc_serialize(mc,msg,args...) \ |
| mc_serialize_internal((mc),(~0),(0),(msg),(msg)->serialize,args) |
| |
| #define mc_serialize2(mc,add,msg,args...) \ |
| mc_serialize_internal((mc),(~0),(add),(msg),(msg)->serialize,args) |
| |
| void mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index); |
| uword mc_unserialize_message (mc_main_t * mcm, mc_stream_t * s, |
| serialize_main_t * m); |
| |
| serialize_function_t serialize_mc_main, unserialize_mc_main; |
| |
| always_inline uword |
| mc_max_message_size_in_bytes (mc_main_t * mcm) |
| { return mcm->transport.max_packet_size - sizeof (mc_msg_user_request_t); } |
| |
| always_inline word |
| mc_serialize_n_bytes_left (mc_main_t * mcm, serialize_main_t * m) |
| { return mc_max_message_size_in_bytes (mcm) - serialize_vlib_buffer_n_bytes (m); } |
| |
| void unserialize_mc_stream (serialize_main_t * m, va_list * va); |
| void mc_stream_join_process_hold (void); |
| |
| #endif /* included_vlib_mc_h */ |