reorder the p2p network's terminate_inactive_connections_loop() to prevent us

from being interrupted before we delete an unresponsive peer #288
2015-09-02 12:15:02 -04:00 · 2015-09-02 12:15:02 -04:00 · d35e2d6d4e
commit d35e2d6d4e
parent dd04bab521
1 changed files with 123 additions and 104 deletions
--- a/libraries/net/node.cpp
+++ b/libraries/net/node.cpp
@ -1253,6 +1253,8 @@ namespace graphene { namespace net { namespace detail {
      std::list<peer_connection_ptr> peers_to_send_keep_alive;
      std::list<peer_connection_ptr> peers_to_terminate;
      uint8_t current_block_interval_in_seconds = _delegate->get_current_block_interval_in_seconds();
      // Disconnect peers that haven't sent us any data recently
      // These numbers are just guesses and we need to think through how this works better.
      // If we and our peers get disconnected from the rest of the network, we will not
@ -1261,6 +1263,11 @@ namespace graphene { namespace net { namespace detail {
      // them (but they won't have sent us anything since they aren't getting blocks either).
      // This might not be so bad because it could make us initiate more connections and
      // reconnect with the rest of the network, or it might just futher isolate us.
      {
        // As usual, the first step is to walk through all our peers and figure out which
        // peers need action (disconneting, sending keepalives, etc), then we walk through 
        // those lists yielding at our leisure later.
        ASSERT_TASK_NOT_PREEMPTED();
        uint32_t handshaking_timeout = _peer_inactivity_timeout;
        fc::time_point handshaking_disconnect_threshold = fc::time_point::now() - fc::seconds(handshaking_timeout);
@ -1285,7 +1292,6 @@ namespace graphene { namespace net { namespace detail {
          }
        // timeout for any active peers is two block intervals
      uint8_t current_block_interval_in_seconds = _delegate->get_current_block_interval_in_seconds();
        uint32_t active_disconnect_timeout = 10 * current_block_interval_in_seconds;
        uint32_t active_send_keepalive_timeount = active_disconnect_timeout / 2;
        uint32_t active_ignored_request_timeount = 3 * current_block_interval_in_seconds;
@ -1368,6 +1374,33 @@ namespace graphene { namespace net { namespace detail {
            peers_to_terminate.push_back(peer);
          }
        // That's the end of the sorting step; now all peers that require further processing are now in one of the
        // lists peers_to_disconnect_gently,  peers_to_disconnect_forcibly, peers_to_send_keep_alive, or peers_to_terminate
        // if we've decided to delete any peers, do it now; in its current implementation this doesn't yield,
        // and once we start yielding, we may find that we've moved that peer to another list (closed or active)
        // and that triggers assertions, maybe even errors
        for (const peer_connection_ptr& peer : peers_to_terminate )
        {
          assert(_terminating_connections.find(peer) != _terminating_connections.end());
          _terminating_connections.erase(peer);
          schedule_peer_for_deletion(peer);
        }
        peers_to_terminate.clear();
        // if we're going to abruptly disconnect anyone, do it here 
        // (it doesn't yield).  I don't think there would be any harm if this were 
        // moved to the yielding section
        for( const peer_connection_ptr& peer : peers_to_disconnect_forcibly )
        {
          move_peer_to_terminating_list(peer);
          peer->close_connection();
        }
        peers_to_disconnect_forcibly.clear();
      } // end ASSERT_TASK_NOT_PREEMPTED()
      // Now process the peers that we need to do yielding functions with (disconnect sends a message with the
      // disconnect reason, so it may yield)
      for( const peer_connection_ptr& peer : peers_to_disconnect_gently )
      {
        fc::exception detailed_error( FC_LOG_MESSAGE(warn, "Disconnecting due to inactivity",
@ -1378,25 +1411,11 @@ namespace graphene { namespace net { namespace detail {
      }
      peers_to_disconnect_gently.clear();
      for( const peer_connection_ptr& peer : peers_to_disconnect_forcibly )
      {
        move_peer_to_terminating_list(peer);
        peer->close_connection();
      }
      peers_to_disconnect_forcibly.clear();
      for( const peer_connection_ptr& peer : peers_to_send_keep_alive )
        peer->send_message(current_time_request_message(),
                           offsetof(current_time_request_message, request_sent_time));
      peers_to_send_keep_alive.clear();
      for (const peer_connection_ptr& peer : peers_to_terminate )
      {
        assert(_terminating_connections.find(peer) != _terminating_connections.end());
        _terminating_connections.erase(peer);
        schedule_peer_for_deletion(peer);
      }
      if (!_node_is_shutting_down && !_terminate_inactive_connections_loop_done.canceled())
         _terminate_inactive_connections_loop_done = fc::schedule( [this](){ terminate_inactive_connections_loop(); },
                                                                   fc::time_point::now() + fc::seconds(GRAPHENE_NET_PEER_HANDSHAKE_INACTIVITY_TIMEOUT / 2),