Fix bug where peers could get stuck in sync mode.
There was a case where we had requested a block through the sync mechanism and also received it through the normal inventory mechanism where we would leave the peer in a sync state, but never ask them for more sync blocks. This commit fixes the bug that put us into that stuck state, and also adds code to disconnect peers if we ever manage to get into that stalled state.
This commit is contained in:
parent
c054a92d5e
commit
f66eeeb73b
3 changed files with 289 additions and 190 deletions
|
|
@ -269,6 +269,7 @@ namespace graphene { namespace net
|
|||
fc::thread* _thread;
|
||||
unsigned _send_message_queue_tasks_running; // temporary debugging
|
||||
#endif
|
||||
bool _currently_handling_message; // true while we're in the middle of handling a message from the remote system
|
||||
private:
|
||||
peer_connection(peer_connection_delegate* delegate);
|
||||
void destroy();
|
||||
|
|
@ -299,8 +300,9 @@ namespace graphene { namespace net
|
|||
fc::ip::endpoint get_local_endpoint();
|
||||
void set_remote_endpoint(fc::optional<fc::ip::endpoint> new_remote_endpoint);
|
||||
|
||||
bool busy();
|
||||
bool idle();
|
||||
bool busy() const;
|
||||
bool idle() const;
|
||||
bool is_currently_handling_message() const;
|
||||
|
||||
bool is_transaction_fetching_inhibited() const;
|
||||
fc::sha512 get_shared_secret() const;
|
||||
|
|
|
|||
|
|
@ -1422,6 +1422,19 @@ namespace graphene { namespace net { namespace detail {
|
|||
( "peer", active_peer->get_remote_endpoint() )("timeout", active_send_keepalive_timeout ) );
|
||||
peers_to_send_keep_alive.push_back(active_peer);
|
||||
}
|
||||
else if (active_peer->we_need_sync_items_from_peer &&
|
||||
!active_peer->is_currently_handling_message() &&
|
||||
!active_peer->item_ids_requested_from_peer &&
|
||||
active_peer->ids_of_items_to_get.empty())
|
||||
{
|
||||
// This is a state we should never get into in the first place, but if we do, we should disconnect the peer
|
||||
// to re-establish the connection.
|
||||
fc_wlog(fc::logger::get("sync"), "Disconnecting peer ${peer} because we think we need blocks from them but sync has stalled.",
|
||||
("peer", active_peer->get_remote_endpoint()));
|
||||
wlog("Disconnecting peer ${peer} because we think we need blocks from them but sync has stalled.",
|
||||
("peer", active_peer->get_remote_endpoint()));
|
||||
peers_to_disconnect_forcibly.push_back(active_peer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2516,6 +2529,11 @@ namespace graphene { namespace net { namespace detail {
|
|||
}
|
||||
originating_peer->item_ids_requested_from_peer.reset();
|
||||
|
||||
// if exceptions are throw after clearing the item_ids_requested_from_peer (above),
|
||||
// it could leave our sync in a stalled state. Wrap a try/catch around the rest
|
||||
// of the function so we can log if this ever happens.
|
||||
try
|
||||
{
|
||||
dlog( "sync: received a list of ${count} available items from ${peer_endpoint}",
|
||||
( "count", blockchain_item_ids_inventory_message_received.item_hashes_available.size() )
|
||||
( "peer_endpoint", originating_peer->get_remote_endpoint() ) );
|
||||
|
|
@ -2693,6 +2711,25 @@ namespace graphene { namespace net { namespace detail {
|
|||
}
|
||||
}
|
||||
}
|
||||
catch (const fc::canceled_exception&)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (const fc::exception& e)
|
||||
{
|
||||
elog("Caught unexpected exception: ${e}", ("e", e));
|
||||
assert(false && "exceptions not expected here");
|
||||
}
|
||||
catch (const std::exception& e)
|
||||
{
|
||||
elog("Caught unexpected exception: ${e}", ("e", e.what()));
|
||||
assert(false && "exceptions not expected here");
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
elog("Caught unexpected exception, could break sync operation");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog("sync: received a list of sync items available, but I didn't ask for any!");
|
||||
|
|
@ -3267,7 +3304,30 @@ namespace graphene { namespace net { namespace detail {
|
|||
block_processed_this_iteration = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
dlog("Already received and accepted this block (presumably through normal inventory mechanism), treating it as accepted");
|
||||
for (const peer_connection_ptr& peer : _active_connections)
|
||||
{
|
||||
auto items_being_processed_iter = peer->ids_of_items_being_processed.find(received_block_iter->block_id);
|
||||
if (items_being_processed_iter != peer->ids_of_items_being_processed.end())
|
||||
{
|
||||
peer->ids_of_items_being_processed.erase(items_being_processed_iter);
|
||||
dlog("Removed item from ${endpoint}'s list of items being processed, still processing ${len} blocks",
|
||||
("endpoint", peer->get_remote_endpoint())("len", peer->ids_of_items_being_processed.size()));
|
||||
|
||||
// if we just processed the last item in our list from this peer, we will want to
|
||||
// send another request to find out if we are now in sync (this is normally handled in
|
||||
// send_sync_block_to_node_delegate)
|
||||
if (peer->ids_of_items_to_get.empty() &&
|
||||
peer->number_of_unfetched_item_ids == 0 &&
|
||||
peer->ids_of_items_being_processed.empty())
|
||||
{
|
||||
dlog("We received last item in our list for peer ${endpoint}, setup to do a sync check", ("endpoint", peer->get_remote_endpoint()));
|
||||
fetch_next_batch_of_item_ids_from_peer(peer.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break; // start iterating _received_sync_items from the beginning
|
||||
} // end if potential_first_block
|
||||
|
|
@ -3483,6 +3543,11 @@ namespace graphene { namespace net { namespace detail {
|
|||
if (sync_item_iter != originating_peer->sync_items_requested_from_peer.end())
|
||||
{
|
||||
originating_peer->sync_items_requested_from_peer.erase(sync_item_iter);
|
||||
// if exceptions are throw here after removing the sync item from the list (above),
|
||||
// it could leave our sync in a stalled state. Wrap a try/catch around the rest
|
||||
// of the function so we can log if this ever happens.
|
||||
try
|
||||
{
|
||||
_active_sync_requests.erase(block_message_to_process.block_id);
|
||||
process_block_during_sync(originating_peer, block_message_to_process, message_hash);
|
||||
if (originating_peer->idle())
|
||||
|
|
@ -3497,6 +3562,25 @@ namespace graphene { namespace net { namespace detail {
|
|||
}
|
||||
return;
|
||||
}
|
||||
catch (const fc::canceled_exception& e)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (const fc::exception& e)
|
||||
{
|
||||
elog("Caught unexpected exception: ${e}", ("e", e));
|
||||
assert(false && "exceptions not expected here");
|
||||
}
|
||||
catch (const std::exception& e)
|
||||
{
|
||||
elog("Caught unexpected exception: ${e}", ("e", e.what()));
|
||||
assert(false && "exceptions not expected here");
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
elog("Caught unexpected exception, could break sync operation");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if we get here, we didn't request the message, we must have a misbehaving peer
|
||||
|
|
|
|||
|
|
@ -29,6 +29,8 @@
|
|||
|
||||
#include <fc/thread/thread.hpp>
|
||||
|
||||
#include <boost/scope_exit.hpp>
|
||||
|
||||
#ifdef DEFAULT_LOGGER
|
||||
# undef DEFAULT_LOGGER
|
||||
#endif
|
||||
|
|
@ -86,11 +88,12 @@ namespace graphene { namespace net
|
|||
inhibit_fetching_sync_blocks(false),
|
||||
transaction_fetching_inhibited_until(fc::time_point::min()),
|
||||
last_known_fork_block_number(0),
|
||||
firewall_check_state(nullptr)
|
||||
firewall_check_state(nullptr),
|
||||
#ifndef NDEBUG
|
||||
,_thread(&fc::thread::current()),
|
||||
_send_message_queue_tasks_running(0)
|
||||
_thread(&fc::thread::current()),
|
||||
_send_message_queue_tasks_running(0),
|
||||
#endif
|
||||
_currently_handling_message(false)
|
||||
{
|
||||
}
|
||||
|
||||
|
|
@ -265,6 +268,10 @@ namespace graphene { namespace net
|
|||
void peer_connection::on_message( message_oriented_connection* originating_connection, const message& received_message )
|
||||
{
|
||||
VERIFY_CORRECT_THREAD();
|
||||
_currently_handling_message = true;
|
||||
BOOST_SCOPE_EXIT(this_) {
|
||||
this_->_currently_handling_message = false;
|
||||
} BOOST_SCOPE_EXIT_END
|
||||
_node->on_message( this, received_message );
|
||||
}
|
||||
|
||||
|
|
@ -438,18 +445,24 @@ namespace graphene { namespace net
|
|||
_remote_endpoint = new_remote_endpoint;
|
||||
}
|
||||
|
||||
bool peer_connection::busy()
|
||||
bool peer_connection::busy() const
|
||||
{
|
||||
VERIFY_CORRECT_THREAD();
|
||||
return !items_requested_from_peer.empty() || !sync_items_requested_from_peer.empty() || item_ids_requested_from_peer;
|
||||
}
|
||||
|
||||
bool peer_connection::idle()
|
||||
bool peer_connection::idle() const
|
||||
{
|
||||
VERIFY_CORRECT_THREAD();
|
||||
return !busy();
|
||||
}
|
||||
|
||||
bool peer_connection::is_currently_handling_message() const
|
||||
{
|
||||
VERIFY_CORRECT_THREAD();
|
||||
return _currently_handling_message;
|
||||
}
|
||||
|
||||
bool peer_connection::is_transaction_fetching_inhibited() const
|
||||
{
|
||||
VERIFY_CORRECT_THREAD();
|
||||
|
|
|
|||
Loading…
Reference in a new issue