uorb: fix several race conditions during topic initialization

Possible race conditions (they all happen between the check of existence
of a topic and trying to create the node):
- single instance, with multiple advertisers during the first advertise:
  both advertisers see the topic as non-existent and try to advertise it.
  One of them will fail, leading to an error message.
  This is the cause for telemetry_status advert failure seen in SITL in
  rare cases.
- multi-instance: subscription to non-existing instance -> px4_open fails,
  and the subscriber tries to create the node. If during that time a
  publisher publishes that instance, the subscriber will get (instance+1)
  (or fails if the max number of instances is exceeded).
  This is a race that goes pretty much unnoticed.
- multi-instance: 2 publishers can get the same instance (if is_published()
  is false in case both have not published data yet).
  This can also go unnoticed.
  Therefore the patch changes where _advertised is set: it is now set
  directly during the advertisement instead of during publication.
This commit is contained in:
Beat Küng 2019-11-21 07:57:39 +01:00 committed by Daniel Agar
parent 63b2befeed
commit 6c8048d057
6 changed files with 41 additions and 20 deletions

View File

@ -55,7 +55,7 @@ uORB::DeviceMaster::~DeviceMaster()
}
int
uORB::DeviceMaster::advertise(const struct orb_metadata *meta, int *instance, int priority)
uORB::DeviceMaster::advertise(const struct orb_metadata *meta, bool is_advertiser, int *instance, int priority)
{
int ret = PX4_ERROR;
@ -119,22 +119,45 @@ uORB::DeviceMaster::advertise(const struct orb_metadata *meta, int *instance, in
delete node;
if (ret == -EEXIST) {
/* if the node exists already, get the existing one and check if
* something has been published yet. */
/* if the node exists already, get the existing one and check if it's advertised. */
uORB::DeviceNode *existing_node = getDeviceNodeLocked(meta, group_tries);
if (existing_node != nullptr && !existing_node->is_advertised()) {
/* nothing has been published yet, lets claim it */
existing_node->set_priority(priority);
/*
* We can claim an existing node in these cases:
* - The node is not advertised (yet). It means there is already one or more subscribers or it was
* unadvertised.
* - We are a single-instance advertiser requesting the first instance.
* (Usually we don't end up here, but we might in case of a race condition between 2
* advertisers).
* - We are a subscriber requesting a certain instance.
* (Also we usually don't end up in that case, but we might in case of a race condtion
* between an advertiser and subscriber).
*/
bool is_single_instance_advertiser = is_advertiser && !instance;
if (existing_node != nullptr &&
(!existing_node->is_advertised() || is_single_instance_advertiser || !is_advertiser)) {
if (is_advertiser) {
existing_node->set_priority(priority);
/* Set as advertised to avoid race conditions (otherwise 2 multi-instance advertisers
* could get the same instance).
*/
existing_node->mark_as_advertised();
}
ret = PX4_OK;
} else {
/* otherwise: data has already been published, keep looking */
/* otherwise: already advertised, keep looking */
}
}
} else {
// add to the node map;.
if (is_advertiser) {
node->mark_as_advertised();
}
// add to the node map.
_node_list.add(node);
}

View File

@ -60,7 +60,7 @@ class uORB::DeviceMaster
{
public:
int advertise(const struct orb_metadata *meta, int *instance, int priority);
int advertise(const struct orb_metadata *meta, bool is_advertiser, int *instance, int priority);
/**
* Public interface for getDeviceNodeLocked(). Takes care of synchronization.

View File

@ -90,6 +90,7 @@ uORB::DeviceNode::open(cdev::file_t *filp)
ret = -EBUSY;
}
mark_as_advertised();
unlock();
/* now complete the open */
@ -306,7 +307,6 @@ uORB::DeviceNode::write(cdev::file_t *filp, const char *buffer, size_t buflen)
/* update the timestamp and generation count */
_last_update = hrt_absolute_time();
_advertised = true;
// callbacks
for (auto item : _callbacks) {

View File

@ -164,6 +164,8 @@ public:
* and publish to this node or if another node should be tried. */
bool is_advertised() const { return _advertised; }
void mark_as_advertised() { _advertised = true; }
/**
* Try to change the size of the queue. This can only be done as long as nobody published yet.
* This is the case, for example when orb_subscribe was called before an orb_advertise.

View File

@ -327,14 +327,12 @@ int uORB::Manager::orb_get_interval(int handle, unsigned *interval)
return ret;
}
int uORB::Manager::node_advertise(const struct orb_metadata *meta, int *instance, int priority)
int uORB::Manager::node_advertise(const struct orb_metadata *meta, bool is_advertiser, int *instance, int priority)
{
int ret = PX4_ERROR;
/* fill advertiser data */
if (get_device_master()) {
ret = _device_master->advertise(meta, instance, priority);
ret = _device_master->advertise(meta, is_advertiser, instance, priority);
}
/* it's PX4_OK if it already exists */
@ -384,7 +382,7 @@ int uORB::Manager::node_open(const struct orb_metadata *meta, bool advertiser, i
if (fd < 0) {
/* try to create the node */
ret = node_advertise(meta, instance, priority);
ret = node_advertise(meta, advertiser, instance, priority);
if (ret == PX4_OK) {
/* update the path, as it might have been updated during the node_advertise call */
@ -396,7 +394,7 @@ int uORB::Manager::node_open(const struct orb_metadata *meta, bool advertiser, i
}
}
/* on success, try the open again */
/* on success, try to open again */
if (ret == PX4_OK) {
fd = px4_open(path, (advertiser) ? PX4_F_WRONLY : PX4_F_RDONLY);
}

View File

@ -392,11 +392,9 @@ private: // class methods
/**
* Advertise a node; don't consider it an error if the node has
* already been advertised.
*
* @todo verify that the existing node is the same as the one
* we tried to advertise.
*/
int node_advertise(const struct orb_metadata *meta, int *instance = nullptr, int priority = ORB_PRIO_DEFAULT);
int node_advertise(const struct orb_metadata *meta, bool is_advertiser, int *instance = nullptr,
int priority = ORB_PRIO_DEFAULT);
/**
* Common implementation for orb_advertise and orb_subscribe.