Commit 93d2a1ff authored by Phoebe Buckheister's avatar Phoebe Buckheister 🦎

meta: lock inode/dentry hash dirs during resync

mkdir() and rmdir() during resync can cause races between bulk sync and
mod sync that can lead to failed resyncs or even data loss.

if a client runs mkdir() that creates an inode in inodes/$L1/$L2 after
bulk sync for inodes/$L1/$L2 has synced all inodes, but before (or
while) bulk sync is removing all inodes it has not transferred itself,
the new inode may be erroneously deleted. if a client runs rmdir()
instead, the deleted inode may cause a resync failure when bulk sync
tries to remove a file that is no longer present.

(cherry picked from commit a0c307c513a7697be03ddd0c203c6e612a936053)
parent 0e22941a
......@@ -142,7 +142,7 @@ bool LocalConnWorker::processIncomingData(
return false;
}
NetMessage::ResponseContext rctx(NULL, workerEndpoint, bufOut, bufOutLen, &stats);
NetMessage::ResponseContext rctx(NULL, workerEndpoint, bufOut, bufOutLen, &stats, true);
bool processRes = msg->processIncoming(rctx);
if(!processRes)
{
......
......@@ -125,9 +125,9 @@ class NetMessage
public:
ResponseContext(struct sockaddr_in* fromAddr, Socket* sock, char* respBuf,
unsigned bufLen, HighResolutionStats* stats)
unsigned bufLen, HighResolutionStats* stats, bool locallyGenerated = false)
: fromAddr(fromAddr), socket(sock), responseBuffer(respBuf),
responseBufferLength(bufLen), stats(stats)
responseBufferLength(bufLen), stats(stats), locallyGenerated(locallyGenerated)
{}
void sendResponse(const NetMessage& response) const
......@@ -148,12 +148,15 @@ class NetMessage
return fromAddr ? Socket::ipaddrToStr(&fromAddr->sin_addr) : socket->getPeername();
}
bool isLocallyGenerated() const { return locallyGenerated; }
private:
struct sockaddr_in* fromAddr;
Socket* socket;
char* responseBuffer;
unsigned responseBufferLength;
HighResolutionStats* stats;
bool locallyGenerated;
};
/**
......
......@@ -27,6 +27,18 @@ class MetaStorageTk
META_INODES_LEVEL1_SUBDIR_NUM, META_INODES_LEVEL2_SUBDIR_NUM);
}
/**
* Get directory for a (non-inlined) inode.
*
* @param inodePath path to inodes subdir of general storage directory
* @param fileName entryID of the inode
*/
static std::pair<unsigned, unsigned> getMetaInodeHash(const std::string& fileName)
{
return StorageTk::getHash(fileName,
META_INODES_LEVEL1_SUBDIR_NUM, META_INODES_LEVEL2_SUBDIR_NUM);
}
/**
* Get path to a contents directory (i.e. the dir containing the dentries by name).
*
......
......@@ -384,6 +384,20 @@ class StorageTk
return path + "/" + getBaseHashPath(entryID, numHashesLevel1, numHashesLevel2);
}
/**
* @return (hashDir1, hashDir2)
*/
static std::pair<unsigned, unsigned> getHash(const std::string& entryID,
size_t numHashesLevel1, size_t numHashesLevel2)
{
uint16_t hashLevel1;
uint16_t hashLevel2;
getHashes(entryID, numHashesLevel1, numHashesLevel2, hashLevel1, hashLevel2);
return {hashLevel1, hashLevel2};
}
/**
* @return hashDir1/hashDir2/entryID
*/
......
......@@ -40,6 +40,14 @@ void BuddyResyncerBulkSyncSlave::syncLoop()
if (candidate.getType() == MetaSyncDirType::InodesHashDir ||
candidate.getType() == MetaSyncDirType::DentriesHashDir)
{
// lock the hash path in accordance with MkLocalDir, RmLocalDir and RmDir.
const auto& hashDir = candidate.getRelativePath();
auto slash1 = hashDir.find('/');
auto slash2 = hashDir.find('/', slash1 + 1);
auto hash1 = StringTk::strHexToUInt(hashDir.substr(slash1 + 1, slash2 - slash1 - 1));
auto hash2 = StringTk::strHexToUInt(hashDir.substr(slash2 + 1));
HashDirLock hashLock = {lockStore, {hash1, hash2}};
const FhgfsOpsErr resyncRes = resyncDirectory(candidate, "");
if (resyncRes == FhgfsOpsErr_SUCCESS)
continue;
......
......@@ -23,9 +23,12 @@ class MirroredMessage : public BaseT
protected:
typedef MirroredMessage BaseType;
BuddyResyncJob* resyncJob;
LockStateT lockState;
MirroredMessage() {}
MirroredMessage():
resyncJob(nullptr)
{}
virtual FhgfsOpsErr processSecondaryResponse(NetMessage& resp) = 0;
......@@ -38,10 +41,12 @@ class MirroredMessage : public BaseT
// IMPORTANT NOTE ON LOCKING ORDER:
// * always take locks the order
// - HashDirLock
// - DirIDLock
// - ParentNameLock
// - FileIDLock
// * always take locks of each type with the order induced by:
// - HashDirLock: id
// - DirIDLock: (id, forWrite)
// - ParentNameLock: (parentID, name)
// - FileIDLock: id
......@@ -55,7 +60,6 @@ class MirroredMessage : public BaseT
{
Session* session = nullptr;
bool isNewState = true;
BuddyResyncJob* resyncJob = nullptr;
if (isMirrored() && !this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond))
{
......
......@@ -21,6 +21,8 @@ bool MkDirMsgEx::processIncoming(ResponseContext& ctx)
App* app = Program::getApp();
entryID = StorageTk::generateFileID(app->getLocalNode().getNumID());
BaseType::processIncoming(ctx);
// update operation counters
......@@ -46,12 +48,20 @@ std::unique_ptr<MirroredMessageResponseState> MkDirMsgEx::executeLocally(Respons
return std::move(result);
}
std::tuple<DirIDLock, ParentNameLock> MkDirMsgEx::lock(EntryLockStore& store)
std::tuple<HashDirLock, DirIDLock, ParentNameLock> MkDirMsgEx::lock(EntryLockStore& store)
{
HashDirLock hashLock;
// during resync we must lock the hash dir of the new inode even if the inode will be created on
// a different node because we select the target node for the inode only after we have locked our
// structures.
if (resyncJob && resyncJob->isRunning())
hashLock = {&store, MetaStorageTk::getMetaInodeHash(entryID)};
DirIDLock dirLock(&store, getParentInfo()->getEntryID(), true);
ParentNameLock dentryLock(&store, getParentInfo()->getEntryID(), getNewDirName());
return std::make_tuple(std::move(dirLock), std::move(dentryLock));
return std::make_tuple(std::move(hashLock), std::move(dirLock), std::move(dentryLock));
}
std::unique_ptr<MkDirMsgEx::ResponseState> MkDirMsgEx::mkDirPrimary(ResponseContext& ctx)
......@@ -132,7 +142,6 @@ std::unique_ptr<MkDirMsgEx::ResponseState> MkDirMsgEx::mkDirPrimary(ResponseCont
const uint16_t ownerNodeID = newOwnerNodes[0];
const std::string parentEntryID = parentInfo->getEntryID();
const std::string entryID = StorageTk::generateFileID(app->getLocalNode().getNumID());
int entryInfoFlags = isBuddyMirrored ? ENTRYINFO_FEATURE_BUDDYMIRRORED : 0;
int mode = getMode();
......
......@@ -9,14 +9,14 @@
#include <net/message/MirroredMessage.h>
class MkDirMsgEx : public MirroredMessage<MkDirMsg, std::tuple<DirIDLock, ParentNameLock>>
class MkDirMsgEx : public MirroredMessage<MkDirMsg, std::tuple<HashDirLock, DirIDLock, ParentNameLock>>
{
public:
typedef ErrorAndEntryResponseState<MkDirRespMsg, NETMSGTYPE_MkDir> ResponseState;
virtual bool processIncoming(ResponseContext& ctx) override;
std::tuple<DirIDLock, ParentNameLock> lock(EntryLockStore& store) override;
std::tuple<HashDirLock, DirIDLock, ParentNameLock> lock(EntryLockStore& store) override;
std::unique_ptr<MirroredMessageResponseState> executeLocally(ResponseContext& ctx,
bool isSecondary) override;
......@@ -24,6 +24,8 @@ class MkDirMsgEx : public MirroredMessage<MkDirMsg, std::tuple<DirIDLock, Parent
bool isMirrored() override { return getParentInfo()->getIsBuddyMirrored(); }
private:
std::string entryID;
std::unique_ptr<ResponseState> mkDirPrimary(ResponseContext& ctx);
std::unique_ptr<ResponseState> mkDirSecondary();
......
......@@ -4,10 +4,18 @@
#include "MkLocalDirMsgEx.h"
std::tuple<> MkLocalDirMsgEx::lock(EntryLockStore& store)
HashDirLock MkLocalDirMsgEx::lock(EntryLockStore& store)
{
// we need not lock anything here, because the inode ID will be completely unknown to anyone
// until we finish processing here *and* on the metadata server that sent this message.
// we usually need not lock anything here, because the inode ID will be completely unknown to
// anyone until we finish processing here *and* on the metadata server that sent this message.
// during resync though we need to lock the hash dir to avoid interefence between bulk resync and
// mod resync.
// do not lock the hash dir if we are creating the inode on the same meta node as the dentry,
// MkDir will have already locked the hash dir.
if (!rctx->isLocallyGenerated() && resyncJob && resyncJob->isRunning())
return {&store, MetaStorageTk::getMetaInodeHash(getEntryInfo()->getEntryID())};
return {};
}
......@@ -20,6 +28,8 @@ bool MkLocalDirMsgEx::processIncoming(ResponseContext& ctx)
as("entryName", entryInfo->getFileName()));
(void) entryInfo;
rctx = &ctx;
return BaseType::processIncoming(ctx);
}
......
......@@ -8,7 +8,7 @@
#include <session/EntryLock.h>
#include <storage/MetaStore.h>
class MkLocalDirMsgEx : public MirroredMessage<MkLocalDirMsg, std::tuple<>>
class MkLocalDirMsgEx : public MirroredMessage<MkLocalDirMsg, HashDirLock>
{
public:
typedef ErrorCodeResponseState<MkLocalDirRespMsg, NETMSGTYPE_MkLocalDir> ResponseState;
......@@ -18,11 +18,13 @@ class MkLocalDirMsgEx : public MirroredMessage<MkLocalDirMsg, std::tuple<>>
std::unique_ptr<MirroredMessageResponseState> executeLocally(ResponseContext& ctx,
bool isSecondary) override;
std::tuple<> lock(EntryLockStore& store) override;
HashDirLock lock(EntryLockStore& store) override;
bool isMirrored() override { return getEntryInfo()->getIsBuddyMirrored(); }
private:
ResponseContext* rctx;
bool forwardToSecondary(ResponseContext& ctx) override;
FhgfsOpsErr processSecondaryResponse(NetMessage& resp) override
......
......@@ -27,7 +27,7 @@ bool RmDirMsgEx::processIncoming(ResponseContext& ctx)
return true;
}
std::tuple<DirIDLock, DirIDLock, ParentNameLock> RmDirMsgEx::lock(EntryLockStore& store)
std::tuple<HashDirLock, DirIDLock, DirIDLock, ParentNameLock> RmDirMsgEx::lock(EntryLockStore& store)
{
MetaStore* metaStore = Program::getApp()->getMetaStore();
EntryInfo delEntryInfo;
......@@ -44,6 +44,10 @@ std::tuple<DirIDLock, DirIDLock, ParentNameLock> RmDirMsgEx::lock(EntryLockStore
DirIDLock parentDirLock;
DirIDLock delDirLock;
HashDirLock hashLock;
if (resyncJob && resyncJob->isRunning())
hashLock = {&store, MetaStorageTk::getMetaInodeHash(delEntryInfo.getEntryID())};
// lock directories in deadlock-avoidance order, see MirroredMessage::lock()
if (delEntryInfo.getEntryID() < getParentInfo()->getEntryID())
......@@ -64,6 +68,7 @@ std::tuple<DirIDLock, DirIDLock, ParentNameLock> RmDirMsgEx::lock(EntryLockStore
ParentNameLock parentNameLock(&store, getParentInfo()->getEntryID(), getDelDirName());
return std::make_tuple(
std::move(hashLock),
std::move(parentDirLock),
std::move(delDirLock),
std::move(parentNameLock));
......
......@@ -9,14 +9,15 @@
class RmDirMsgEx : public MirroredMessage<RmDirMsg,
std::tuple<DirIDLock, DirIDLock, ParentNameLock>>
std::tuple<HashDirLock, DirIDLock, DirIDLock, ParentNameLock>>
{
public:
typedef ErrorCodeResponseState<RmDirRespMsg, NETMSGTYPE_RmDir> ResponseState;
virtual bool processIncoming(ResponseContext& ctx) override;
std::tuple<DirIDLock, DirIDLock, ParentNameLock> lock(EntryLockStore& store) override;
std::tuple<HashDirLock, DirIDLock, DirIDLock, ParentNameLock>
lock(EntryLockStore& store) override;
static FhgfsOpsErr rmRemoteDirInode(EntryInfo* delEntryInfo);
......
#include <program/Program.h>
#include <common/net/message/storage/creating/RmLocalDirRespMsg.h>
#include <common/toolkit/MetaStorageTk.h>
#include "RmLocalDirMsgEx.h"
DirIDLock RmLocalDirMsgEx::lock(EntryLockStore& store)
std::tuple<HashDirLock, DirIDLock> RmLocalDirMsgEx::lock(EntryLockStore& store)
{
return {&store, getDelEntryInfo()->getEntryID(), true};
HashDirLock hashLock;
// if we are currently under modsync, we must lock the hash dir from which we are removing the
// inode. otherwise bulk sync may interfere with mod sync and cause the resync to fail.
// do not lock the hash dir if we are removing the inode from the same meta node as the dentry,
// RmDir will have already locked the hash dir.
if (!rctx->isLocallyGenerated() && resyncJob && resyncJob->isRunning())
hashLock = {&store, MetaStorageTk::getMetaInodeHash(getDelEntryInfo()->getEntryID())};
return std::make_tuple(
std::move(hashLock),
DirIDLock(&store, getDelEntryInfo()->getEntryID(), true));
}
bool RmLocalDirMsgEx::processIncoming(ResponseContext& ctx)
......@@ -17,6 +29,8 @@ bool RmLocalDirMsgEx::processIncoming(ResponseContext& ctx)
LOG_DEBUG(logContext, 4, "Received a RmLocalDirMsg from: " + ctx.peerName() );
#endif // BEEGFS_DEBUG
rctx = &ctx;
return BaseType::processIncoming(ctx);
}
......
......@@ -8,7 +8,7 @@
#include <storage/MetaStore.h>
#include <net/message/MirroredMessage.h>
class RmLocalDirMsgEx : public MirroredMessage<RmLocalDirMsg, DirIDLock>
class RmLocalDirMsgEx : public MirroredMessage<RmLocalDirMsg, std::tuple<HashDirLock, DirIDLock>>
{
public:
typedef ErrorCodeResponseState<RmLocalDirRespMsg, NETMSGTYPE_RmLocalDir> ResponseState;
......@@ -18,11 +18,13 @@ class RmLocalDirMsgEx : public MirroredMessage<RmLocalDirMsg, DirIDLock>
std::unique_ptr<MirroredMessageResponseState> executeLocally(ResponseContext& ctx,
bool isSecondary) override;
DirIDLock lock(EntryLockStore& store) override;
std::tuple<HashDirLock, DirIDLock> lock(EntryLockStore& store) override;
bool isMirrored() override { return getDelEntryInfo()->getIsBuddyMirrored(); }
private:
ResponseContext* rctx;
std::unique_ptr<ResponseState> rmDir();
bool forwardToSecondary(ResponseContext& ctx) override;
......
......@@ -20,6 +20,32 @@ struct RenameV2Locks
// if target exists, the target file must be unlocked to exclude concurrent operations on
// target (eg close, setxattr, ...)
FileIDLock unlinkedFileLock;
RenameV2Locks() = default;
RenameV2Locks(const RenameV2Locks&) = delete;
RenameV2Locks& operator=(const RenameV2Locks&) = delete;
RenameV2Locks(RenameV2Locks&& other)
{
swap(other);
}
RenameV2Locks& operator=(RenameV2Locks&& other)
{
RenameV2Locks(std::move(other)).swap(*this);
return *this;
}
void swap(RenameV2Locks& other)
{
std::swap(fromNameLock, other.fromNameLock);
std::swap(toNameLock, other.toNameLock);
std::swap(fromDirLock, other.fromDirLock);
std::swap(toDirLock, other.toDirLock);
std::swap(fromFileLock, other.fromFileLock);
std::swap(unlinkedFileLock, other.unlinkedFileLock);
}
};
class RenameV2MsgEx : public MirroredMessage<RenameMsg, RenameV2Locks>
......
......@@ -126,4 +126,25 @@ class ParentNameLock : UniqueEntryLockBase<ParentNameLockData>
}
};
class HashDirLock : UniqueEntryLockBase<HashDirLockData>
{
public:
HashDirLock() = default;
HashDirLock(const HashDirLock&) = delete;
HashDirLock& operator=(const HashDirLock&) = delete;
HashDirLock(HashDirLock&& src) : BaseType(std::move(src)) {}
HashDirLock& operator=(HashDirLock&& src)
{
BaseType::operator=(std::move(src));
return *this;
}
HashDirLock(EntryLockStore* entryLockStore, std::pair<unsigned, unsigned> hashDir)
: UniqueEntryLockBase<HashDirLockData>(entryLockStore, hashDir)
{
}
};
#endif /* ENTRYLOCK_H_ */
......@@ -28,6 +28,13 @@ DirIDLockData* EntryLockStore::lock(const std::string& dirID, const bool writeLo
return &lock;
}
HashDirLockData* EntryLockStore::lock(std::pair<unsigned, unsigned> hashDir)
{
HashDirLockData& lock = hashDirLocks.getLockFor(hashDir);
lock.getLock().lock();
return &lock;
}
void EntryLockStore::unlock(ParentNameLockData* parentNameLockData)
{
parentNameLockData->getLock().unlock();
......@@ -45,3 +52,9 @@ void EntryLockStore::unlock(DirIDLockData* dirIDLockData)
dirIDLockData->getLock().unlock();
dirLocks.putLock(*dirIDLockData);
}
void EntryLockStore::unlock(HashDirLockData* hashDirLockData)
{
hashDirLockData->getLock().unlock();
hashDirLocks.putLock(*hashDirLockData);
}
......@@ -196,6 +196,16 @@ struct ValueLockHash<std::pair<std::string, std::string> >
}
};
template<>
struct ValueLockHash<std::pair<unsigned, unsigned>>
{
uint32_t operator()(std::pair<unsigned, unsigned> pair) const
{
// optimized for the inode/dentries hash dir structure of 2 fragments of 7 bits each
return std::hash<unsigned>()((pair.first << 8) | pair.second);
}
};
typedef ValueLockStore<std::pair<std::string, std::string>, Mutex, 1024> ParentNameLockStore;
typedef ParentNameLockStore::ValueLock ParentNameLockData;
......@@ -205,21 +215,27 @@ typedef DirIDLockStore::ValueLock DirIDLockData;
typedef ValueLockStore<std::string, Mutex, 1024> FileIDLockStore;
typedef FileIDLockStore::ValueLock FileIDLockData;
typedef ValueLockStore<std::pair<unsigned, unsigned>, Mutex, 1024> HashDirLockStore;
typedef HashDirLockStore::ValueLock HashDirLockData;
class EntryLockStore
{
public:
ParentNameLockData* lock(const std::string& parentID, const std::string& name);
FileIDLockData* lock(const std::string& fileID);
DirIDLockData* lock(const std::string& dirID, const bool writeLock);
HashDirLockData* lock(std::pair<unsigned, unsigned> hashDir);
void unlock(ParentNameLockData* parentNameLock);
void unlock(FileIDLockData* fileIDLock);
void unlock(DirIDLockData* dirIDLock);
void unlock(HashDirLockData* hashDirLock);
private:
ParentNameLockStore parentNameLocks;
FileIDLockStore fileLocks;
DirIDLockStore dirLocks;
HashDirLockStore hashDirLocks;
};
#endif /* ENTRYLOCKSTORE_H_ */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment