...
 
Commits (5)
......@@ -32,10 +32,10 @@ BuddyResyncJob::BuddyResyncJob() :
for (size_t i = 0; i < numSyncSlaves; i++)
bulkSyncSlaves.emplace_back(
boost::make_unique<BuddyResyncerBulkSyncSlave>(&syncCandidates, i, buddyNodeID, *this));
boost::make_unique<BuddyResyncerBulkSyncSlave>(*this, &syncCandidates, i, buddyNodeID));
sessionStoreResyncer = boost::make_unique<SessionStoreResyncer>(buddyNodeID);
modSyncSlave = boost::make_unique<BuddyResyncerModSyncSlave>(&syncCandidates, 1, buddyNodeID);
modSyncSlave = boost::make_unique<BuddyResyncerModSyncSlave>(*this, &syncCandidates, 1, buddyNodeID);
}
// Defaulted destructor, defined outside the class body.
// NOTE(review): presumably defined here (not in the header) so that the members created with
// boost::make_unique in the constructor are destroyed where their types are complete - confirm.
BuddyResyncJob::~BuddyResyncJob() = default;
......
......@@ -12,10 +12,11 @@
#include <dirent.h>
BuddyResyncerBulkSyncSlave::BuddyResyncerBulkSyncSlave(MetaSyncCandidateStore* syncCandidates,
uint8_t slaveID, const NumNodeID& buddyNodeID, BuddyResyncJob& parentJob) :
SyncSlaveBase("BuddyResyncerBulkSyncSlave_" + StringTk::uintToStr(slaveID), buddyNodeID),
syncCandidates(syncCandidates), parentJob(&parentJob)
// Constructs a bulk sync slave.
//
// parentJob      - forwarded to the SyncSlaveBase constructor.
// syncCandidates - pointer stored as-is in the syncCandidates member.
// slaveID        - converted with StringTk::uintToStr and appended to
//                  "BuddyResyncerBulkSyncSlave_" to form the name passed to SyncSlaveBase.
// buddyNodeID    - forwarded to the SyncSlaveBase constructor.
BuddyResyncerBulkSyncSlave::BuddyResyncerBulkSyncSlave(BuddyResyncJob& parentJob,
MetaSyncCandidateStore* syncCandidates, uint8_t slaveID, const NumNodeID& buddyNodeID) :
SyncSlaveBase("BuddyResyncerBulkSyncSlave_" + StringTk::uintToStr(slaveID), parentJob,
buddyNodeID),
syncCandidates(syncCandidates)
{
}
......
......@@ -18,8 +18,8 @@ class BuddyResyncerBulkSyncSlave : public SyncSlaveBase
friend class BuddyResyncJob;
public:
BuddyResyncerBulkSyncSlave(MetaSyncCandidateStore* syncCanditates, uint8_t slaveID,
const NumNodeID& buddyNodeID, BuddyResyncJob& parentJob);
BuddyResyncerBulkSyncSlave(BuddyResyncJob& parentJob, MetaSyncCandidateStore* syncCanditates, uint8_t slaveID,
const NumNodeID& buddyNodeID);
struct Stats
{
......@@ -37,7 +37,6 @@ class BuddyResyncerBulkSyncSlave : public SyncSlaveBase
private:
MetaSyncCandidateStore* syncCandidates;
BuddyResyncJob* parentJob;
AtomicUInt64 numDirsSynced;
AtomicUInt64 numFilesSynced;
......
......@@ -3,6 +3,7 @@
#include <common/net/message/storage/mirroring/ResyncRawInodesRespMsg.h>
#include <common/toolkit/StringTk.h>
#include <common/toolkit/MessagingTk.h>
#include <common/toolkit/DebugVariable.h>
#include <common/Common.h>
#include <net/message/storage/mirroring/ResyncRawInodesMsgEx.h>
......@@ -10,9 +11,10 @@
#include <program/Program.h>
#include <toolkit/XAttrTk.h>
BuddyResyncerModSyncSlave::BuddyResyncerModSyncSlave(MetaSyncCandidateStore* syncCandidates,
uint8_t slaveID, const NumNodeID& buddyNodeID) :
SyncSlaveBase("BuddyResyncerModSyncSlave_" + StringTk::uintToStr(slaveID), buddyNodeID),
// Constructs a modification sync slave.
//
// parentJob      - forwarded to the SyncSlaveBase constructor.
// syncCandidates - pointer stored as-is in the syncCandidates member.
// slaveID        - converted with StringTk::uintToStr and appended to
//                  "BuddyResyncerModSyncSlave_" to form the name passed to SyncSlaveBase.
// buddyNodeID    - forwarded to the SyncSlaveBase constructor.
BuddyResyncerModSyncSlave::BuddyResyncerModSyncSlave(BuddyResyncJob& parentJob,
MetaSyncCandidateStore* syncCandidates, uint8_t slaveID, const NumNodeID& buddyNodeID) :
SyncSlaveBase("BuddyResyncerModSyncSlave_" + StringTk::uintToStr(slaveID), parentJob,
buddyNodeID),
syncCandidates(syncCandidates)
{
}
......@@ -60,6 +62,8 @@ bool resyncElemCmp(const MetaSyncCandidateFile::Element& a, const MetaSyncCandid
FhgfsOpsErr BuddyResyncerModSyncSlave::streamCandidates(Socket& socket)
{
DEBUG_ENV_VAR(unsigned, DEBUG_FAIL_MODSYNC, 0, "BEEGFS_DEBUG_FAIL_MODSYNC");
while (!getSelfTerminateNotIdle())
{
if (syncCandidates->isFilesEmpty())
......@@ -109,10 +113,15 @@ FhgfsOpsErr BuddyResyncerModSyncSlave::streamCandidates(Socket& socket)
return FhgfsOpsErr_INTERNAL;
}
if (resyncRes != FhgfsOpsErr_SUCCESS)
if (resyncRes != FhgfsOpsErr_SUCCESS || DEBUG_FAIL_MODSYNC)
{
LOG(ERR, "Modification resync failed.", element.path, element.isDeletion, resyncRes);
numErrors.increase();
// Since this error prevents the resync from reaching a GOOD state on the secondary,
// we abort here.
parentJob->abort();
// terminate the current stream, start a new one if necessary. we could (in theory)
// reuse the current stream, but terminating a stream that has seen an error is simpler
// to handle than keeping it open. also, bulk resync would like "fail on error"
......
......@@ -18,8 +18,8 @@ class BuddyResyncerModSyncSlave : public SyncSlaveBase
friend class BuddyResyncJob;
public:
BuddyResyncerModSyncSlave(MetaSyncCandidateStore* syncCanditates, uint8_t slaveID,
const NumNodeID& buddyNodeID);
BuddyResyncerModSyncSlave(BuddyResyncJob& parentJob, MetaSyncCandidateStore* syncCanditates,
uint8_t slaveID, const NumNodeID& buddyNodeID);
struct Stats
{
......
......@@ -28,6 +28,8 @@ class SyncSlaveBase : public PThread
}
protected:
BuddyResyncJob* parentJob;
NumNodeID buddyNodeID;
Mutex stateMutex;
......@@ -39,8 +41,9 @@ class SyncSlaveBase : public PThread
Path basePath;
SyncSlaveBase(const std::string& threadName, const NumNodeID buddyNodeID):
PThread(threadName), buddyNodeID(buddyNodeID), isRunning(false)
// Initializes the common state of a sync slave.
//
// threadName  - passed on to the PThread base class.
// parentJob   - its address is stored in the parentJob pointer member, so the referenced job
//               has to stay valid for as long as this object dereferences that pointer.
// buddyNodeID - copied into the buddyNodeID member.
//
// isRunning starts out as false.
SyncSlaveBase(const std::string& threadName, BuddyResyncJob& parentJob,
const NumNodeID buddyNodeID):
PThread(threadName), parentJob(&parentJob), buddyNodeID(buddyNodeID), isRunning(false)
{
}
......
# Test: a failing modification resync (modsync) must abort the metadata resync of mirror
# group 101. Afterwards --resyncstats has to report "completed with errors", zero synced
# modification objects and a non-zero modification sync error count.
mount = node.properties["client0"]["mount"]
on node do cd mount do
# Upper bound (seconds) for the modification loop below; the same value is handed to
# wait_daemon_log as its timeout (presumably also seconds - confirm).
timeout = 60
# Flag shared by the two tasks below: set by the checking task, polled by the modification loop.
done = false
# Create some files to keep resync running long enough for modsync to fail.
shell "for i in `seq 1 10000`; do touch file_$i; done"
parallel do
# Task that creates work for modsync: touches file_1 every 0.1 s until `done` is set or
# `timeout` seconds have passed.
sequential do
t1 = Time.now
while Time.now < t1 + timeout and not done do
shell "touch file_1"
sleep 0.1
end
end
# Task that starts and tests the resync and modsync abort.
sequential do
# NOTE(review): --startresync is issued twice in a row with identical arguments - confirm
# whether the repetition is intentional.
shell "sudo #{$binaries[:ctl]} --startresync --nodetype=meta --mirrorgroupid=101"
shell "sudo #{$binaries[:ctl]} --startresync --nodetype=meta --mirrorgroupid=101"
# Wait for the failure message in the meta0 log; any error raised by wait_daemon_log is
# replaced by a test-specific failure message.
begin
wait_daemon_log(node, "meta0", "Modification resync failed.", timeout)
rescue
raise "Modification resync was expected to fail and abort immediately but didn't."
end
done = true # Enough modifications done to trigger failure
# wait a moment to make sure the --resyncstats has been updated
sleep 10
# check if resync has actually stopped
raise "Modification resync did not complete with errors." if
not shell? "sudo #{$binaries[:ctl]} --resyncstats --nodetype=meta \
--mirrorgroupid=101 | grep -i \"completed with errors\""
# modsync should fail immediately, so 0 objects synced, 1 error
raise "Modification resync should not have synced any objects." if
not shell? "sudo #{$binaries[:ctl]} --resyncstats --nodetype=meta \
--mirrorgroupid=101 | grep -i \"modification objects synced: 0\""
# Inverted check: the test fails if the stats report exactly zero modification sync errors.
raise "Modification resync completed with 0 errors, should have been at least 1." if
shell? "sudo #{$binaries[:ctl]} --resyncstats --nodetype=meta \
--mirrorgroupid=101 | grep -Ei \"modification sync errors: 0\""
end
end
end end # on node cd mount
# Environment for the modsync abort test: services to run plus the metadata mirror group.
hosts:
node:
- env:
# Debug switch: a non-zero value makes the metadata server treat every modification
# resync as failed, which is the failure this test waits for.
BEEGFS_DEBUG_FAIL_MODSYNC: 1
- mgmtd
- meta:
id: 1
- meta:
id: 2
- storage:
id: 3
- helperd
- client
mirroring:
meta_groups:
# Metadata mirror group addressed by the test via --mirrorgroupid=101.
- id: 101
primary: 1
secondary: 2