Fix bug where peers could get stuck in sync mode.

There was a case where we had requested a block through the sync
mechanism and also received it through the normal inventory mechanism,
in which case we would leave the peer in a sync state but never ask
them for more sync blocks.

This commit fixes the bug that put us into that stuck state, and also
adds code to disconnect peers if we ever manage to get into that stalled
state.
This commit is contained in:
Eric Frias 2017-08-08 12:05:57 -04:00
parent c054a92d5e
commit f66eeeb73b
3 changed files with 289 additions and 190 deletions

View file

@ -269,6 +269,7 @@ namespace graphene { namespace net
fc::thread* _thread;
unsigned _send_message_queue_tasks_running; // temporary debugging
#endif
bool _currently_handling_message; // true while we're in the middle of handling a message from the remote system
private:
peer_connection(peer_connection_delegate* delegate);
void destroy();
@ -299,8 +300,9 @@ namespace graphene { namespace net
fc::ip::endpoint get_local_endpoint();
void set_remote_endpoint(fc::optional<fc::ip::endpoint> new_remote_endpoint);
bool busy();
bool idle();
bool busy() const;
bool idle() const;
bool is_currently_handling_message() const;
bool is_transaction_fetching_inhibited() const;
fc::sha512 get_shared_secret() const;

View file

@ -1422,6 +1422,19 @@ namespace graphene { namespace net { namespace detail {
( "peer", active_peer->get_remote_endpoint() )("timeout", active_send_keepalive_timeout ) );
peers_to_send_keep_alive.push_back(active_peer);
}
else if (active_peer->we_need_sync_items_from_peer &&
!active_peer->is_currently_handling_message() &&
!active_peer->item_ids_requested_from_peer &&
active_peer->ids_of_items_to_get.empty())
{
// This is a state we should never get into in the first place, but if we do, we should disconnect the peer
// to re-establish the connection.
fc_wlog(fc::logger::get("sync"), "Disconnecting peer ${peer} because we think we need blocks from them but sync has stalled.",
("peer", active_peer->get_remote_endpoint()));
wlog("Disconnecting peer ${peer} because we think we need blocks from them but sync has stalled.",
("peer", active_peer->get_remote_endpoint()));
peers_to_disconnect_forcibly.push_back(active_peer);
}
}
}
@ -2516,6 +2529,11 @@ namespace graphene { namespace net { namespace detail {
}
originating_peer->item_ids_requested_from_peer.reset();
// if exceptions are thrown after clearing the item_ids_requested_from_peer (above),
// it could leave our sync in a stalled state. Wrap a try/catch around the rest
// of the function so we can log if this ever happens.
try
{
dlog( "sync: received a list of ${count} available items from ${peer_endpoint}",
( "count", blockchain_item_ids_inventory_message_received.item_hashes_available.size() )
( "peer_endpoint", originating_peer->get_remote_endpoint() ) );
@ -2693,6 +2711,25 @@ namespace graphene { namespace net { namespace detail {
}
}
}
catch (const fc::canceled_exception&)
{
throw;
}
catch (const fc::exception& e)
{
elog("Caught unexpected exception: ${e}", ("e", e));
assert(false && "exceptions not expected here");
}
catch (const std::exception& e)
{
elog("Caught unexpected exception: ${e}", ("e", e.what()));
assert(false && "exceptions not expected here");
}
catch (...)
{
elog("Caught unexpected exception, could break sync operation");
}
}
else
{
wlog("sync: received a list of sync items available, but I didn't ask for any!");
@ -3267,7 +3304,30 @@ namespace graphene { namespace net { namespace detail {
block_processed_this_iteration = true;
}
else
{
dlog("Already received and accepted this block (presumably through normal inventory mechanism), treating it as accepted");
for (const peer_connection_ptr& peer : _active_connections)
{
auto items_being_processed_iter = peer->ids_of_items_being_processed.find(received_block_iter->block_id);
if (items_being_processed_iter != peer->ids_of_items_being_processed.end())
{
peer->ids_of_items_being_processed.erase(items_being_processed_iter);
dlog("Removed item from ${endpoint}'s list of items being processed, still processing ${len} blocks",
("endpoint", peer->get_remote_endpoint())("len", peer->ids_of_items_being_processed.size()));
// if we just processed the last item in our list from this peer, we will want to
// send another request to find out if we are now in sync (this is normally handled in
// send_sync_block_to_node_delegate)
if (peer->ids_of_items_to_get.empty() &&
peer->number_of_unfetched_item_ids == 0 &&
peer->ids_of_items_being_processed.empty())
{
dlog("We received last item in our list for peer ${endpoint}, setup to do a sync check", ("endpoint", peer->get_remote_endpoint()));
fetch_next_batch_of_item_ids_from_peer(peer.get());
}
}
}
}
break; // start iterating _received_sync_items from the beginning
} // end if potential_first_block
@ -3483,6 +3543,11 @@ namespace graphene { namespace net { namespace detail {
if (sync_item_iter != originating_peer->sync_items_requested_from_peer.end())
{
originating_peer->sync_items_requested_from_peer.erase(sync_item_iter);
// if exceptions are thrown here after removing the sync item from the list (above),
// it could leave our sync in a stalled state. Wrap a try/catch around the rest
// of the function so we can log if this ever happens.
try
{
_active_sync_requests.erase(block_message_to_process.block_id);
process_block_during_sync(originating_peer, block_message_to_process, message_hash);
if (originating_peer->idle())
@ -3497,6 +3562,25 @@ namespace graphene { namespace net { namespace detail {
}
return;
}
catch (const fc::canceled_exception& e)
{
throw;
}
catch (const fc::exception& e)
{
elog("Caught unexpected exception: ${e}", ("e", e));
assert(false && "exceptions not expected here");
}
catch (const std::exception& e)
{
elog("Caught unexpected exception: ${e}", ("e", e.what()));
assert(false && "exceptions not expected here");
}
catch (...)
{
elog("Caught unexpected exception, could break sync operation");
}
}
}
// if we get here, we didn't request the message, we must have a misbehaving peer

View file

@ -29,6 +29,8 @@
#include <fc/thread/thread.hpp>
#include <boost/scope_exit.hpp>
#ifdef DEFAULT_LOGGER
# undef DEFAULT_LOGGER
#endif
@ -86,11 +88,12 @@ namespace graphene { namespace net
inhibit_fetching_sync_blocks(false),
transaction_fetching_inhibited_until(fc::time_point::min()),
last_known_fork_block_number(0),
firewall_check_state(nullptr)
firewall_check_state(nullptr),
#ifndef NDEBUG
,_thread(&fc::thread::current()),
_send_message_queue_tasks_running(0)
_thread(&fc::thread::current()),
_send_message_queue_tasks_running(0),
#endif
_currently_handling_message(false)
{
}
@ -265,6 +268,10 @@ namespace graphene { namespace net
// Entry point for every message delivered by the underlying connection.
// Sets _currently_handling_message for the duration of the dispatch so other
// code (e.g. the node's stalled-sync disconnect check, which calls
// is_currently_handling_message()) can tell this peer is mid-message;
// BOOST_SCOPE_EXIT guarantees the flag is cleared even if the node's
// handler throws.
void peer_connection::on_message( message_oriented_connection* originating_connection, const message& received_message )
{
VERIFY_CORRECT_THREAD();
_currently_handling_message = true;
BOOST_SCOPE_EXIT(this_) {
this_->_currently_handling_message = false;
} BOOST_SCOPE_EXIT_END
_node->on_message( this, received_message );
}
@ -438,18 +445,24 @@ namespace graphene { namespace net
_remote_endpoint = new_remote_endpoint;
}
bool peer_connection::busy()
// A peer is "busy" when we have any outstanding request to it:
// regular items, sync items, or a pending request for item ids.
// Must be called on the connection's owning thread.
bool peer_connection::busy() const
{
VERIFY_CORRECT_THREAD();
return !items_requested_from_peer.empty() || !sync_items_requested_from_peer.empty() || item_ids_requested_from_peer;
}
bool peer_connection::idle()
// A peer is "idle" when it is not busy (no outstanding requests of any kind).
// Must be called on the connection's owning thread.
bool peer_connection::idle() const
{
VERIFY_CORRECT_THREAD();
return !busy();
}
// True while on_message() is executing for this peer; used to avoid
// treating a peer as stalled while it is still processing a message.
// Must be called on the connection's owning thread.
bool peer_connection::is_currently_handling_message() const
{
VERIFY_CORRECT_THREAD();
return _currently_handling_message;
}
bool peer_connection::is_transaction_fetching_inhibited() const
{
VERIFY_CORRECT_THREAD();