From 6b80daab0a998b380f296567ab1e1f467e7aaab1 Mon Sep 17 00:00:00 2001 From: Robin Mueller Date: Mon, 20 Feb 2023 18:12:33 +0100 Subject: [PATCH 1/6] maybe this caused the hangup issue? --- linux/ipcore/PdecHandler.cpp | 29 ++++++++++++++++++++++++----- linux/ipcore/PdecHandler.h | 9 ++++++++- mission/tmtc/CfdpTmFunnel.cpp | 6 +++--- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/linux/ipcore/PdecHandler.cpp b/linux/ipcore/PdecHandler.cpp index 3244280e..8eb2d00a 100644 --- a/linux/ipcore/PdecHandler.cpp +++ b/linux/ipcore/PdecHandler.cpp @@ -1,6 +1,7 @@ #include "PdecHandler.h" #include +#include #include #include #include @@ -113,7 +114,7 @@ ReturnValue_t PdecHandler::polledOperation() { // Requires reconfiguration and reinitialization of PDEC triggerEvent(INVALID_FAR); state = State::WAIT_FOR_RECOVERY; - return result; + break; } state = State::RUNNING; break; @@ -147,6 +148,9 @@ ReturnValue_t PdecHandler::irqOperation() { uint32_t info = 1; ssize_t nb = 0; int ret = 0; + + interruptWindowCd.resetTimer(); + // Clear interrupts with dummy read before unmasking the interrupt. Use a volatile to prevent // read being optimized away. volatile uint32_t dummy = *(registerBaseAddress + PDEC_PIR_OFFSET); @@ -157,7 +161,7 @@ ReturnValue_t PdecHandler::irqOperation() { readCommandQueue(); switch (state) { case State::INIT: - resetFarStatFlag(); + result = resetFarStatFlag(); if (result != returnvalue::OK) { // Requires reconfiguration and reinitialization of PDEC triggerEvent(INVALID_FAR); @@ -180,9 +184,11 @@ ReturnValue_t PdecHandler::irqOperation() { if (ret == 0) { // No TCs for timeout period checkLocks(); - lockCheckCd.resetTimer(); + genericCheckCd.resetTimer(); + interruptWindowCd.resetTimer(); } else if (ret >= 1) { nb = read(fd, &info, sizeof(info)); + interruptCounter++; if (nb == static_cast(sizeof(info))) { uint32_t pisr = *(registerBaseAddress + PDEC_PISR_OFFSET); if ((pisr & TC_NEW_MASK) == TC_NEW_MASK) { @@ -197,9 +203,19 @@ ReturnValue_t PdecHandler::irqOperation() { CURRENT_FAR = readFar(); checkFrameAna(CURRENT_FAR); } - if (lockCheckCd.hasTimedOut()) { + if (genericCheckCd.hasTimedOut()) { checkLocks(); - lockCheckCd.resetTimer(); + genericCheckCd.resetTimer(); + if (interruptWindowCd.timeOut()) { + if (interruptCounter >= MAX_ALLOWED_IRQS_PER_WINDOW) { + sif::error << "PdecHandler::irqOperation: Possible IRQ storm" << std::endl; + triggerEvent(TOO_MANY_IRQS, MAX_ALLOWED_IRQS_PER_WINDOW); + TaskFactory::delayTask(400); + break; + } + interruptWindowCd.resetTimer(); + interruptCounter = 0; + } } // Clear interrupts with dummy read dummy = *(registerBaseAddress + PDEC_PIR_OFFSET); @@ -215,9 +231,12 @@ ReturnValue_t PdecHandler::irqOperation() { break; } case State::WAIT_FOR_RECOVERY: + TaskFactory::delayTask(400); break; default: + // Should never happen. sif::error << "PdecHandler::performOperation: Invalid state" << std::endl; + TaskFactory::delayTask(400); break; } } diff --git a/linux/ipcore/PdecHandler.h b/linux/ipcore/PdecHandler.h index b514f501..e6da7724 100644 --- a/linux/ipcore/PdecHandler.h +++ b/linux/ipcore/PdecHandler.h @@ -87,6 +87,8 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc static const Event LOST_CARRIER_LOCK_PDEC = MAKE_EVENT(5, severity::INFO); //! [EXPORT] : [COMMENT] Lost bit lock static const Event LOST_BIT_LOCK_PDEC = MAKE_EVENT(6, severity::INFO); + //! [EXPORT] : [COMMENT] Too many IRQs over the time window of one second. P1: Allowed TCs + static constexpr Event TOO_MANY_IRQS = MAKE_EVENT(7, severity::MEDIUM); static constexpr Event POLL_SYSCALL_ERROR_PDEC = event::makeEvent(SUBSYSTEM_ID, 7, severity::MEDIUM); static constexpr Event WRITE_SYSCALL_ERROR_PDEC = @@ -180,6 +182,8 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc // discarded static const uint8_t MAP_CLK_FREQ = 2; + static constexpr uint32_t MAX_ALLOWED_IRQS_PER_WINDOW = 500; + enum class FrameAna_t : uint8_t { ABANDONED_CLTU, FRAME_DIRTY, @@ -206,13 +210,16 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc static uint32_t CURRENT_FAR; - Countdown lockCheckCd = Countdown(IRQ_TIMEOUT_MS); + Countdown genericCheckCd = Countdown(IRQ_TIMEOUT_MS); object_id_t tcDestinationId; AcceptsTelecommandsIF* tcDestination = nullptr; LinuxLibgpioIF* gpioComIF = nullptr; + uint32_t interruptCounter = 0; + Countdown interruptWindowCd = Countdown(1000); + /** * Reset signal is required to hold PDEC in reset state until the configuration has been * written to the appropriate memory space. diff --git a/mission/tmtc/CfdpTmFunnel.cpp b/mission/tmtc/CfdpTmFunnel.cpp index c057295a..e687911f 100644 --- a/mission/tmtc/CfdpTmFunnel.cpp +++ b/mission/tmtc/CfdpTmFunnel.cpp @@ -55,8 +55,7 @@ ReturnValue_t CfdpTmFunnel::handlePacket(TmTcMessage& msg) { } size_t packetLen = 0; uint8_t* serPtr = newPacketData; - result = - spacePacketHeader.serializeBe(&serPtr, &packetLen, spacePacketHeader.getFullPacketLen()); + result = spacePacketHeader.serializeBe(&serPtr, &packetLen, spacePacketHeader.getFullPacketLen()); if (result != returnvalue::OK) { #if FSFW_CPP_OSTREAM_ENABLED == 1 sif::error << "CfdpTmFunnel::handlePacket: Error serializing packet" << std::endl; @@ -83,7 +82,8 @@ ReturnValue_t CfdpTmFunnel::handlePacket(TmTcMessage& msg) { } else { #if FSFW_CPP_OSTREAM_ENABLED == 1 sif::error << "PusTmFunnel::handlePacket: Store too full to create data copy or store " - "error" << std::endl; + "error" + << std::endl; break; #endif } From 9285e13ce51c386bad8720977c3210380a0f0ccf Mon Sep 17 00:00:00 2001 From: Robin Mueller Date: Mon, 20 Feb 2023 18:17:27 +0100 Subject: [PATCH 2/6] increase allowed IRQs --- linux/ipcore/PdecHandler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linux/ipcore/PdecHandler.h b/linux/ipcore/PdecHandler.h index e6da7724..c68926cf 100644 --- a/linux/ipcore/PdecHandler.h +++ b/linux/ipcore/PdecHandler.h @@ -182,7 +182,7 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc // discarded static const uint8_t MAP_CLK_FREQ = 2; - static constexpr uint32_t MAX_ALLOWED_IRQS_PER_WINDOW = 500; + static constexpr uint32_t MAX_ALLOWED_IRQS_PER_WINDOW = 800; enum class FrameAna_t : uint8_t { ABANDONED_CLTU, From 803c0b1a3e7b2dbee06d63ac1a3057c1c13a53b9 Mon Sep 17 00:00:00 2001 From: Robin Mueller Date: Mon, 20 Feb 2023 18:35:19 +0100 Subject: [PATCH 3/6] changelog --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71818173..ba982468 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,8 @@ change warranting a new major release: ## Fixed - CFDP Funnel bugfix: CCSDS wrapping was buggy and works properly now. +- PDEC: Some adaptions to prevent task lockups on invalid FAR states. + PR: https://egit.irs.uni-stuttgart.de/eive/eive-obsw/pulls/393 ## Changed @@ -27,6 +29,14 @@ change warranting a new major release: ## Added +- PDEC: Added basic FDIR to limit the number of allowed TC interrupts and to allow complete task + lockups in the case an IRQ is immediately re-raised by the PDEC module. This is done by only + allowing a certain number of handled IRQs (whether they yield a valid TC or not) during + time windows of one second. Right now, 800 IRQs/TCs are allowed per time window. + This time window is reset if a TC reception timeout after 500ms occurs. TBD whether the maximum + allowed number will be a configurable parameter. If the number of occured IRQs is exceeded, + an event is triggered and the task is delayed for 400 ms. + PR: https://egit.irs.uni-stuttgart.de/eive/eive-obsw/pulls/393 - git post checkout hook which initializes and updates the submodules automatically. From bcce945cce5836928d592255024ea3b537e2692d Mon Sep 17 00:00:00 2001 From: Robin Mueller Date: Mon, 20 Feb 2023 18:36:22 +0100 Subject: [PATCH 4/6] reset counter as well --- linux/ipcore/PdecHandler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/linux/ipcore/PdecHandler.cpp b/linux/ipcore/PdecHandler.cpp index 8eb2d00a..81801101 100644 --- a/linux/ipcore/PdecHandler.cpp +++ b/linux/ipcore/PdecHandler.cpp @@ -185,6 +185,7 @@ ReturnValue_t PdecHandler::irqOperation() { // No TCs for timeout period checkLocks(); genericCheckCd.resetTimer(); + interruptCounter = 0; interruptWindowCd.resetTimer(); } else if (ret >= 1) { nb = read(fd, &info, sizeof(info)); From 94262a9d04de0b677f5fc8d39f33de65c66e881c Mon Sep 17 00:00:00 2001 From: Robin Mueller Date: Mon, 20 Feb 2023 18:41:26 +0100 Subject: [PATCH 5/6] common function for irq limiters --- linux/ipcore/PdecHandler.cpp | 11 +++++++---- linux/ipcore/PdecHandler.h | 2 ++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/linux/ipcore/PdecHandler.cpp b/linux/ipcore/PdecHandler.cpp index 81801101..aedfef9f 100644 --- a/linux/ipcore/PdecHandler.cpp +++ b/linux/ipcore/PdecHandler.cpp @@ -185,8 +185,7 @@ ReturnValue_t PdecHandler::irqOperation() { // No TCs for timeout period checkLocks(); genericCheckCd.resetTimer(); - interruptCounter = 0; - interruptWindowCd.resetTimer(); + resetIrqLimiters(); } else if (ret >= 1) { nb = read(fd, &info, sizeof(info)); interruptCounter++; @@ -214,8 +213,7 @@ ReturnValue_t PdecHandler::irqOperation() { TaskFactory::delayTask(400); break; } - interruptWindowCd.resetTimer(); - interruptCounter = 0; + resetIrqLimiters(); } } // Clear interrupts with dummy read @@ -638,6 +636,11 @@ void PdecHandler::printPdecMon() { uint32_t PdecHandler::readFar() { return *(registerBaseAddress + PDEC_FAR_OFFSET); } +void PdecHandler::resetIrqLimiters() { + interruptWindowCd.resetTimer(); + interruptCounter = 0; +} + std::string PdecHandler::getMonStatusString(uint32_t status) { switch (status) { case TC_CHANNEL_INACTIVE: diff --git a/linux/ipcore/PdecHandler.h b/linux/ipcore/PdecHandler.h index c68926cf..5348f120 100644 --- a/linux/ipcore/PdecHandler.h +++ b/linux/ipcore/PdecHandler.h @@ -301,6 +301,8 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc */ void checkLocks(); + void resetIrqLimiters(); + /** * @brief Analyzes the FramAna field (frame analysis data) of a FAR report. * From 60b07035b1ee9c386ee87336dab53a3ccf44621c Mon Sep 17 00:00:00 2001 From: Robin Mueller Date: Mon, 20 Feb 2023 18:53:56 +0100 Subject: [PATCH 6/6] separate function, code less confusing --- linux/ipcore/PdecHandler.cpp | 124 +++++++++++++++++++---------------- linux/ipcore/PdecHandler.h | 1 + 2 files changed, 67 insertions(+), 58 deletions(-) diff --git a/linux/ipcore/PdecHandler.cpp b/linux/ipcore/PdecHandler.cpp index aedfef9f..5ea85516 100644 --- a/linux/ipcore/PdecHandler.cpp +++ b/linux/ipcore/PdecHandler.cpp @@ -146,8 +146,6 @@ ReturnValue_t PdecHandler::irqOperation() { // Used to unmask IRQ uint32_t info = 1; - ssize_t nb = 0; - int ret = 0; interruptWindowCd.resetTimer(); @@ -171,62 +169,7 @@ ReturnValue_t PdecHandler::irqOperation() { state = State::RUNNING; break; case State::RUNNING: { - nb = write(fd, &info, sizeof(info)); - if (nb != static_cast(sizeof(info))) { - sif::error << "PdecHandler::irqOperation: Unmasking IRQ failed" << std::endl; - triggerEvent(WRITE_SYSCALL_ERROR_PDEC, errno); - close(fd); - state = State::INIT; - return returnvalue::FAILED; - } - struct pollfd fds = {.fd = fd, .events = POLLIN, .revents = 0}; - ret = poll(&fds, 1, IRQ_TIMEOUT_MS); - if (ret == 0) { - // No TCs for timeout period - checkLocks(); - genericCheckCd.resetTimer(); - resetIrqLimiters(); - } else if (ret >= 1) { - nb = read(fd, &info, sizeof(info)); - interruptCounter++; - if (nb == static_cast(sizeof(info))) { - uint32_t pisr = *(registerBaseAddress + PDEC_PISR_OFFSET); - if ((pisr & TC_NEW_MASK) == TC_NEW_MASK) { - // handle TC - handleNewTc(); - } - if ((pisr & TC_ABORT_MASK) == TC_ABORT_MASK) { - tcAbortCounter += 1; - } - if ((pisr & NEW_FAR_MASK) == NEW_FAR_MASK) { - // Read FAR here - CURRENT_FAR = readFar(); - checkFrameAna(CURRENT_FAR); - } - if (genericCheckCd.hasTimedOut()) { - checkLocks(); - genericCheckCd.resetTimer(); - if (interruptWindowCd.timeOut()) { - if (interruptCounter >= MAX_ALLOWED_IRQS_PER_WINDOW) { - sif::error << "PdecHandler::irqOperation: Possible IRQ storm" << std::endl; - triggerEvent(TOO_MANY_IRQS, MAX_ALLOWED_IRQS_PER_WINDOW); - TaskFactory::delayTask(400); - break; - } - resetIrqLimiters(); - } - } - // Clear interrupts with dummy read - dummy = *(registerBaseAddress + PDEC_PIR_OFFSET); - } - } else { - sif::error << "PdecHandler::irqOperation: Poll error with errno " << errno << ": " - << strerror(errno) << std::endl; - triggerEvent(POLL_SYSCALL_ERROR_PDEC, errno); - close(fd); - state = State::INIT; - return returnvalue::FAILED; - } + checkAndHandleIrqs(fd, info); break; } case State::WAIT_FOR_RECOVERY: @@ -244,6 +187,71 @@ ReturnValue_t PdecHandler::irqOperation() { return returnvalue::OK; } +ReturnValue_t PdecHandler::checkAndHandleIrqs(int fd, uint32_t& info) { + ssize_t nb = write(fd, &info, sizeof(info)); + if (nb != static_cast(sizeof(info))) { + sif::error << "PdecHandler::irqOperation: Unmasking IRQ failed" << std::endl; + triggerEvent(WRITE_SYSCALL_ERROR_PDEC, errno); + close(fd); + state = State::INIT; + return returnvalue::FAILED; + } + struct pollfd fds = {.fd = fd, .events = POLLIN, .revents = 0}; + int ret = poll(&fds, 1, IRQ_TIMEOUT_MS); + if (ret == 0) { + // No TCs for timeout period + checkLocks(); + genericCheckCd.resetTimer(); + resetIrqLimiters(); + } else if (ret >= 1) { + // Interrupt handling. + nb = read(fd, &info, sizeof(info)); + interruptCounter++; + if (nb == static_cast(sizeof(info))) { + uint32_t pisr = *(registerBaseAddress + PDEC_PISR_OFFSET); + if ((pisr & TC_NEW_MASK) == TC_NEW_MASK) { + // handle TC + handleNewTc(); + } + if ((pisr & TC_ABORT_MASK) == TC_ABORT_MASK) { + tcAbortCounter += 1; + } + if ((pisr & NEW_FAR_MASK) == NEW_FAR_MASK) { + // Read FAR here + CURRENT_FAR = readFar(); + checkFrameAna(CURRENT_FAR); + } + // Clear interrupts with dummy read. Volatile is important here to prevent + // compiler opitmizations in release builds! + volatile uint32_t dummy = *(registerBaseAddress + PDEC_PIR_OFFSET); + static_cast(dummy); + + if (genericCheckCd.hasTimedOut()) { + checkLocks(); + genericCheckCd.resetTimer(); + if (interruptWindowCd.hasTimedOut()) { + if (interruptCounter >= MAX_ALLOWED_IRQS_PER_WINDOW) { + sif::error << "PdecHandler::irqOperation: Possible IRQ storm" << std::endl; + triggerEvent(TOO_MANY_IRQS, MAX_ALLOWED_IRQS_PER_WINDOW); + resetIrqLimiters(); + TaskFactory::delayTask(400); + return returnvalue::FAILED; + } + resetIrqLimiters(); + } + } + } + } else { + sif::error << "PdecHandler::irqOperation: Poll error with errno " << errno << ": " + << strerror(errno) << std::endl; + triggerEvent(POLL_SYSCALL_ERROR_PDEC, errno); + close(fd); + state = State::INIT; + return returnvalue::FAILED; + } + return returnvalue::OK; +} + void PdecHandler::readCommandQueue(void) { CommandMessage commandMessage; ReturnValue_t result = returnvalue::FAILED; diff --git a/linux/ipcore/PdecHandler.h b/linux/ipcore/PdecHandler.h index 5348f120..51cf750e 100644 --- a/linux/ipcore/PdecHandler.h +++ b/linux/ipcore/PdecHandler.h @@ -266,6 +266,7 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc ReturnValue_t polledOperation(); ReturnValue_t irqOperation(); + ReturnValue_t checkAndHandleIrqs(int fd, uint32_t& info); uint32_t readFar();