Commit 0e55212d authored by Phoebe Buckheister

Merge branch 'mod-sync-early-abort-v6' into 'v6'

Mod sync early abort v6

See merge request beegfs/projects0!808
parents 76c77829 50789a02
...@@ -32,10 +32,10 @@ BuddyResyncJob::BuddyResyncJob() : ...@@ -32,10 +32,10 @@ BuddyResyncJob::BuddyResyncJob() :
for (size_t i = 0; i < numSyncSlaves; i++) for (size_t i = 0; i < numSyncSlaves; i++)
bulkSyncSlaves.emplace_back( bulkSyncSlaves.emplace_back(
boost::make_unique<BuddyResyncerBulkSyncSlave>(&syncCandidates, i, buddyNodeID, *this)); boost::make_unique<BuddyResyncerBulkSyncSlave>(*this, &syncCandidates, i, buddyNodeID));
sessionStoreResyncer = boost::make_unique<SessionStoreResyncer>(buddyNodeID); sessionStoreResyncer = boost::make_unique<SessionStoreResyncer>(buddyNodeID);
modSyncSlave = boost::make_unique<BuddyResyncerModSyncSlave>(&syncCandidates, 1, buddyNodeID); modSyncSlave = boost::make_unique<BuddyResyncerModSyncSlave>(*this, &syncCandidates, 1, buddyNodeID);
} }
BuddyResyncJob::~BuddyResyncJob() = default; BuddyResyncJob::~BuddyResyncJob() = default;
......
...@@ -12,10 +12,11 @@ ...@@ -12,10 +12,11 @@
#include <dirent.h> #include <dirent.h>
BuddyResyncerBulkSyncSlave::BuddyResyncerBulkSyncSlave(MetaSyncCandidateStore* syncCandidates, BuddyResyncerBulkSyncSlave::BuddyResyncerBulkSyncSlave(BuddyResyncJob& parentJob,
uint8_t slaveID, const NumNodeID& buddyNodeID, BuddyResyncJob& parentJob) : MetaSyncCandidateStore* syncCandidates, uint8_t slaveID, const NumNodeID& buddyNodeID) :
SyncSlaveBase("BuddyResyncerBulkSyncSlave_" + StringTk::uintToStr(slaveID), buddyNodeID), SyncSlaveBase("BuddyResyncerBulkSyncSlave_" + StringTk::uintToStr(slaveID), parentJob,
syncCandidates(syncCandidates), parentJob(&parentJob) buddyNodeID),
syncCandidates(syncCandidates)
{ {
} }
......
...@@ -18,8 +18,8 @@ class BuddyResyncerBulkSyncSlave : public SyncSlaveBase ...@@ -18,8 +18,8 @@ class BuddyResyncerBulkSyncSlave : public SyncSlaveBase
friend class BuddyResyncJob; friend class BuddyResyncJob;
public: public:
BuddyResyncerBulkSyncSlave(MetaSyncCandidateStore* syncCanditates, uint8_t slaveID, BuddyResyncerBulkSyncSlave(BuddyResyncJob& parentJob, MetaSyncCandidateStore* syncCanditates, uint8_t slaveID,
const NumNodeID& buddyNodeID, BuddyResyncJob& parentJob); const NumNodeID& buddyNodeID);
struct Stats struct Stats
{ {
...@@ -37,7 +37,6 @@ class BuddyResyncerBulkSyncSlave : public SyncSlaveBase ...@@ -37,7 +37,6 @@ class BuddyResyncerBulkSyncSlave : public SyncSlaveBase
private: private:
MetaSyncCandidateStore* syncCandidates; MetaSyncCandidateStore* syncCandidates;
BuddyResyncJob* parentJob;
AtomicUInt64 numDirsSynced; AtomicUInt64 numDirsSynced;
AtomicUInt64 numFilesSynced; AtomicUInt64 numFilesSynced;
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <common/net/message/storage/mirroring/ResyncRawInodesRespMsg.h> #include <common/net/message/storage/mirroring/ResyncRawInodesRespMsg.h>
#include <common/toolkit/StringTk.h> #include <common/toolkit/StringTk.h>
#include <common/toolkit/MessagingTk.h> #include <common/toolkit/MessagingTk.h>
#include <common/toolkit/DebugVariable.h>
#include <common/Common.h> #include <common/Common.h>
#include <net/message/storage/mirroring/ResyncRawInodesMsgEx.h> #include <net/message/storage/mirroring/ResyncRawInodesMsgEx.h>
...@@ -10,9 +11,10 @@ ...@@ -10,9 +11,10 @@
#include <program/Program.h> #include <program/Program.h>
#include <toolkit/XAttrTk.h> #include <toolkit/XAttrTk.h>
BuddyResyncerModSyncSlave::BuddyResyncerModSyncSlave(MetaSyncCandidateStore* syncCandidates, BuddyResyncerModSyncSlave::BuddyResyncerModSyncSlave(BuddyResyncJob& parentJob,
uint8_t slaveID, const NumNodeID& buddyNodeID) : MetaSyncCandidateStore* syncCandidates, uint8_t slaveID, const NumNodeID& buddyNodeID) :
SyncSlaveBase("BuddyResyncerModSyncSlave_" + StringTk::uintToStr(slaveID), buddyNodeID), SyncSlaveBase("BuddyResyncerModSyncSlave_" + StringTk::uintToStr(slaveID), parentJob,
buddyNodeID),
syncCandidates(syncCandidates) syncCandidates(syncCandidates)
{ {
} }
...@@ -60,6 +62,8 @@ bool resyncElemCmp(const MetaSyncCandidateFile::Element& a, const MetaSyncCandid ...@@ -60,6 +62,8 @@ bool resyncElemCmp(const MetaSyncCandidateFile::Element& a, const MetaSyncCandid
FhgfsOpsErr BuddyResyncerModSyncSlave::streamCandidates(Socket& socket) FhgfsOpsErr BuddyResyncerModSyncSlave::streamCandidates(Socket& socket)
{ {
DEBUG_ENV_VAR(unsigned, DEBUG_FAIL_MODSYNC, 0, "BEEGFS_DEBUG_FAIL_MODSYNC");
while (!getSelfTerminateNotIdle()) while (!getSelfTerminateNotIdle())
{ {
if (syncCandidates->isFilesEmpty()) if (syncCandidates->isFilesEmpty())
...@@ -109,10 +113,15 @@ FhgfsOpsErr BuddyResyncerModSyncSlave::streamCandidates(Socket& socket) ...@@ -109,10 +113,15 @@ FhgfsOpsErr BuddyResyncerModSyncSlave::streamCandidates(Socket& socket)
return FhgfsOpsErr_INTERNAL; return FhgfsOpsErr_INTERNAL;
} }
if (resyncRes != FhgfsOpsErr_SUCCESS) if (resyncRes != FhgfsOpsErr_SUCCESS || DEBUG_FAIL_MODSYNC)
{ {
LOG(ERR, "Modification resync failed.", element.path, element.isDeletion, resyncRes); LOG(ERR, "Modification resync failed.", element.path, element.isDeletion, resyncRes);
numErrors.increase(); numErrors.increase();
// Since this error prevents the resync from reaching a GOOD state on the secondary,
// we abort here.
parentJob->abort();
// terminate the current stream, start a new one if necessary. we could (in theory) // terminate the current stream, start a new one if necessary. we could (in theory)
// reuse the current stream, but terminating a stream that has seen an error is simpler // reuse the current stream, but terminating a stream that has seen an error is simpler
// to handle than keeping it open. also, bulk resync would like "fail on error" // to handle than keeping it open. also, bulk resync would like "fail on error"
......
...@@ -18,8 +18,8 @@ class BuddyResyncerModSyncSlave : public SyncSlaveBase ...@@ -18,8 +18,8 @@ class BuddyResyncerModSyncSlave : public SyncSlaveBase
friend class BuddyResyncJob; friend class BuddyResyncJob;
public: public:
BuddyResyncerModSyncSlave(MetaSyncCandidateStore* syncCanditates, uint8_t slaveID, BuddyResyncerModSyncSlave(BuddyResyncJob& parentJob, MetaSyncCandidateStore* syncCanditates,
const NumNodeID& buddyNodeID); uint8_t slaveID, const NumNodeID& buddyNodeID);
struct Stats struct Stats
{ {
......
...@@ -28,6 +28,8 @@ class SyncSlaveBase : public PThread ...@@ -28,6 +28,8 @@ class SyncSlaveBase : public PThread
} }
protected: protected:
BuddyResyncJob* parentJob;
NumNodeID buddyNodeID; NumNodeID buddyNodeID;
Mutex stateMutex; Mutex stateMutex;
...@@ -39,8 +41,9 @@ class SyncSlaveBase : public PThread ...@@ -39,8 +41,9 @@ class SyncSlaveBase : public PThread
Path basePath; Path basePath;
SyncSlaveBase(const std::string& threadName, const NumNodeID buddyNodeID): SyncSlaveBase(const std::string& threadName, BuddyResyncJob& parentJob,
PThread(threadName), buddyNodeID(buddyNodeID), isRunning(false) const NumNodeID buddyNodeID):
PThread(threadName), parentJob(&parentJob), buddyNodeID(buddyNodeID), isRunning(false)
{ {
} }
......
# Integration test for the "mod sync early abort" behaviour: with the debug
# environment variable BEEGFS_DEBUG_FAIL_MODSYNC set on the node, every
# modification resync is treated as failed, which must abort the whole buddy
# resync job. The test starts a meta resync for mirror group 101 while files
# are being modified and then checks the log and the resync statistics.
mount = node.properties["client0"]["mount"]
# NOTE(review): presumably runs the enclosed block on `node` with `mount` as
# the working directory - confirm against the test framework.
on node do cd mount do
# How long (in seconds) shall the test wait for the resync to start and fail?
timeout = 60
# Set to true by the resync task below to stop the modification loop.
done = false
# Create some files to keep resync running long enough for modsync to fail.
shell "for i in `seq 1 10000`; do touch file_$i; done"
parallel do
# Task that creates work for modsync: touches the same file every 0.1s until
# either the timeout elapses or the other task sets `done`.
sequential do
t1 = Time.now
while Time.now < t1 + timeout and not done do
shell "touch file_1"
sleep 0.1
end
end
# Task that starts and tests the resync and modsync abort.
sequential do
# NOTE(review): --startresync is issued twice in a row; unclear whether the
# repetition is intentional - confirm.
shell "sudo #{$binaries[:ctl]} --startresync --nodetype=meta --mirrorgroupid=101"
shell "sudo #{$binaries[:ctl]} --startresync --nodetype=meta --mirrorgroupid=101"
# Wait for the error message that the mod sync slave logs on failure.
# Presumably wait_daemon_log raises when the message does not show up within
# `timeout`; any such error is replaced by the more descriptive one below.
begin
wait_daemon_log(node, "meta0", "Modification resync failed.", timeout)
rescue
raise "Modification resync was expected to fail and abort immediately but didn't."
end
done = true # Enough modifications done to trigger failure
# wait a moment to make sure the --resyncstats has been updated
sleep 10
# check if resync has actually stopped
# (shell? presumably reports whether the command pipeline exited successfully,
# i.e. whether grep found a match - confirm.)
raise "Modification resync did not complete with errors." if
not shell? "sudo #{$binaries[:ctl]} --resyncstats --nodetype=meta \
--mirrorgroupid=101 | grep -i \"completed with errors\""
# modsync should fail immediately, so 0 objects synced, 1 error
raise "Modification resync should not have synced any objects." if
not shell? "sudo #{$binaries[:ctl]} --resyncstats --nodetype=meta \
--mirrorgroupid=101 | grep -i \"modification objects synced: 0\""
# Inverted check: the test fails if the stats report zero modification sync
# errors.
raise "Modification resync completed with 0 errors, should have been at least 1." if
shell? "sudo #{$binaries[:ctl]} --resyncstats --nodetype=meta \
--mirrorgroupid=101 | grep -Ei \"modification sync errors: 0\""
end
end
end end # on node cd mount
# Cluster layout for the mod sync abort test: a single node with
# BEEGFS_DEBUG_FAIL_MODSYNC set in its environment, running mgmtd, two meta
# services (ids 1 and 2), one storage service (id 3), helperd and a client.
# The two meta services form mirror group 101 (primary 1, secondary 2), which
# is the group id the test passes to --startresync / --resyncstats.
hosts:
node:
- env:
BEEGFS_DEBUG_FAIL_MODSYNC: 1
- mgmtd
- meta:
id: 1
- meta:
id: 2
- storage:
id: 3
- helperd
- client
mirroring:
meta_groups:
- id: 101
primary: 1
secondary: 2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment