Merge pull request 'PDEC possible fixes' (#393) from pdec_possible_fixes_and_fdir into develop
All checks were successful
EIVE/eive-obsw/pipeline/head This commit looks good

Reviewed-on: #393
Reviewed-by: Jakob Meier <meierj@irs.uni-stuttgart.de>
This commit is contained in:
Jakob Meier 2023-02-22 14:45:48 +01:00
commit e915f6cc90
3 changed files with 104 additions and 50 deletions

View File

@ -17,6 +17,17 @@ change warranting a new major release:
# [unreleased] # [unreleased]
## Added
- PDEC: Added basic FDIR to limit the number of allowed TC interrupts and to prevent complete task
lockups in the case an IRQ is immediately re-raised by the PDEC module. This is done by only
allowing a certain number of handled IRQs (whether they yield a valid TC or not) during
time windows of one second. Right now, 800 IRQs/TCs are allowed per time window.
This time window is reset if a TC reception timeout after 500 ms occurs. TBD whether the maximum
allowed number will be a configurable parameter. If the number of IRQs which occurred exceeds the
allowed maximum, an event is triggered and the task is delayed for 400 ms.
PR: https://egit.irs.uni-stuttgart.de/eive/eive-obsw/pulls/393
# [v1.29.1] # [v1.29.1]
## Fixed ## Fixed
@ -56,6 +67,8 @@ eive-tmtc: v2.13.0
- Patch version which compiles for EM - Patch version which compiles for EM
- CFDP Funnel bugfix: CCSDS wrapping was buggy and works properly now. - CFDP Funnel bugfix: CCSDS wrapping was buggy and works properly now.
- PDEC: Some adaptions to prevent task lockups on invalid FAR states.
PR: https://egit.irs.uni-stuttgart.de/eive/eive-obsw/pulls/393
- CMakeLists.txt fix which broke CI/CD builds when server could not retrieve full git SHA. - CMakeLists.txt fix which broke CI/CD builds when server could not retrieve full git SHA.
- Possible regression in the MAX31865 polling task: Using a `ManualCsLockGuard` for reconfiguring - Possible regression in the MAX31865 polling task: Using a `ManualCsLockGuard` for reconfiguring
and then polling the sensor is problematic, invalid sensor values will be read. and then polling the sensor is problematic, invalid sensor values will be read.

View File

@ -1,6 +1,7 @@
#include "PdecHandler.h" #include "PdecHandler.h"
#include <fcntl.h> #include <fcntl.h>
#include <fsfw/tasks/TaskFactory.h>
#include <poll.h> #include <poll.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <unistd.h> #include <unistd.h>
@ -113,7 +114,7 @@ ReturnValue_t PdecHandler::polledOperation() {
// Requires reconfiguration and reinitialization of PDEC // Requires reconfiguration and reinitialization of PDEC
triggerEvent(INVALID_FAR); triggerEvent(INVALID_FAR);
state = State::WAIT_FOR_RECOVERY; state = State::WAIT_FOR_RECOVERY;
return result; break;
} }
state = State::RUNNING; state = State::RUNNING;
break; break;
@ -145,8 +146,9 @@ ReturnValue_t PdecHandler::irqOperation() {
// Used to unmask IRQ // Used to unmask IRQ
uint32_t info = 1; uint32_t info = 1;
ssize_t nb = 0;
int ret = 0; interruptWindowCd.resetTimer();
// Clear interrupts with dummy read before unmasking the interrupt. Use a volatile to prevent // Clear interrupts with dummy read before unmasking the interrupt. Use a volatile to prevent
// read being optimized away. // read being optimized away.
volatile uint32_t dummy = *(registerBaseAddress + PDEC_PIR_OFFSET); volatile uint32_t dummy = *(registerBaseAddress + PDEC_PIR_OFFSET);
@ -157,7 +159,7 @@ ReturnValue_t PdecHandler::irqOperation() {
readCommandQueue(); readCommandQueue();
switch (state) { switch (state) {
case State::INIT: case State::INIT:
resetFarStatFlag(); result = resetFarStatFlag();
if (result != returnvalue::OK) { if (result != returnvalue::OK) {
// Requires reconfiguration and reinitialization of PDEC // Requires reconfiguration and reinitialization of PDEC
triggerEvent(INVALID_FAR); triggerEvent(INVALID_FAR);
@ -167,57 +169,16 @@ ReturnValue_t PdecHandler::irqOperation() {
state = State::RUNNING; state = State::RUNNING;
break; break;
case State::RUNNING: { case State::RUNNING: {
nb = write(fd, &info, sizeof(info)); checkAndHandleIrqs(fd, info);
if (nb != static_cast<ssize_t>(sizeof(info))) {
sif::error << "PdecHandler::irqOperation: Unmasking IRQ failed" << std::endl;
triggerEvent(WRITE_SYSCALL_ERROR_PDEC, errno);
close(fd);
state = State::INIT;
return returnvalue::FAILED;
}
struct pollfd fds = {.fd = fd, .events = POLLIN, .revents = 0};
ret = poll(&fds, 1, IRQ_TIMEOUT_MS);
if (ret == 0) {
// No TCs for timeout period
checkLocks();
lockCheckCd.resetTimer();
} else if (ret >= 1) {
nb = read(fd, &info, sizeof(info));
if (nb == static_cast<ssize_t>(sizeof(info))) {
uint32_t pisr = *(registerBaseAddress + PDEC_PISR_OFFSET);
if ((pisr & TC_NEW_MASK) == TC_NEW_MASK) {
// handle TC
handleNewTc();
}
if ((pisr & TC_ABORT_MASK) == TC_ABORT_MASK) {
tcAbortCounter += 1;
}
if ((pisr & NEW_FAR_MASK) == NEW_FAR_MASK) {
// Read FAR here
CURRENT_FAR = readFar();
checkFrameAna(CURRENT_FAR);
}
if (lockCheckCd.hasTimedOut()) {
checkLocks();
lockCheckCd.resetTimer();
}
// Clear interrupts with dummy read
dummy = *(registerBaseAddress + PDEC_PIR_OFFSET);
}
} else {
sif::error << "PdecHandler::irqOperation: Poll error with errno " << errno << ": "
<< strerror(errno) << std::endl;
triggerEvent(POLL_SYSCALL_ERROR_PDEC, errno);
close(fd);
state = State::INIT;
return returnvalue::FAILED;
}
break; break;
} }
case State::WAIT_FOR_RECOVERY: case State::WAIT_FOR_RECOVERY:
TaskFactory::delayTask(400);
break; break;
default: default:
// Should never happen.
sif::error << "PdecHandler::performOperation: Invalid state" << std::endl; sif::error << "PdecHandler::performOperation: Invalid state" << std::endl;
TaskFactory::delayTask(400);
break; break;
} }
} }
@ -226,6 +187,71 @@ ReturnValue_t PdecHandler::irqOperation() {
return returnvalue::OK; return returnvalue::OK;
} }
/**
 * @brief Unmasks the PDEC IRQ, waits for the next interrupt and services it.
 *
 * The function writes @p info to @p fd to unmask the IRQ and then polls the descriptor for up to
 * IRQ_TIMEOUT_MS.
 *  - On a poll timeout, the locks are checked and both the generic check countdown and the IRQ
 *    limiters are reset.
 *  - On an interrupt, the interrupt counter is incremented, the PISR register is evaluated
 *    (new TC, TC abort, new FAR) and the interrupt is cleared with a dummy read of the PIR
 *    register. Afterwards, the periodic lock check and the IRQ rate limiter are run.
 *
 * @param fd    File descriptor used to unmask and wait for the IRQ. It is closed by this function
 *              if the write or poll call fails, so it must not be used again after such a failure.
 * @param info  Value written to unmask the IRQ. It is overwritten by the value read from @p fd
 *              when an interrupt is signalled.
 * @return returnvalue::OK on timeout or after an interrupt was processed.
 *         returnvalue::FAILED if the write or poll system call failed (state is set to
 *         State::INIT) or if the allowed number of IRQs per time window was exceeded (the task
 *         was delayed for 400 ms in that case).
 *
 * NOTE(review): the call site visible in irqOperation() does not evaluate the return value.
 * Confirm that a descriptor closed here is not reused by the caller.
 */
ReturnValue_t PdecHandler::checkAndHandleIrqs(int fd, uint32_t& info) {
  ssize_t nb = write(fd, &info, sizeof(info));
  if (nb != static_cast<ssize_t>(sizeof(info))) {
    // Capture errno immediately. The stream output below is allowed to modify it.
    const int writeErrno = errno;
    sif::error << "PdecHandler::irqOperation: Unmasking IRQ failed" << std::endl;
    triggerEvent(WRITE_SYSCALL_ERROR_PDEC, writeErrno);
    close(fd);
    state = State::INIT;
    return returnvalue::FAILED;
  }
  struct pollfd fds = {.fd = fd, .events = POLLIN, .revents = 0};
  const int ret = poll(&fds, 1, IRQ_TIMEOUT_MS);
  if (ret < 0) {
    // Capture errno immediately so the printed value, the error string and the event parameter
    // all refer to the poll failure.
    const int pollErrno = errno;
    sif::error << "PdecHandler::irqOperation: Poll error with errno " << pollErrno << ": "
               << strerror(pollErrno) << std::endl;
    triggerEvent(POLL_SYSCALL_ERROR_PDEC, pollErrno);
    close(fd);
    state = State::INIT;
    return returnvalue::FAILED;
  }
  if (ret == 0) {
    // No TCs for timeout period
    checkLocks();
    genericCheckCd.resetTimer();
    resetIrqLimiters();
    return returnvalue::OK;
  }

  // Interrupt handling.
  nb = read(fd, &info, sizeof(info));
  interruptCounter++;
  if (nb == static_cast<ssize_t>(sizeof(info))) {
    uint32_t pisr = *(registerBaseAddress + PDEC_PISR_OFFSET);
    if ((pisr & TC_NEW_MASK) == TC_NEW_MASK) {
      // handle TC
      handleNewTc();
    }
    if ((pisr & TC_ABORT_MASK) == TC_ABORT_MASK) {
      tcAbortCounter += 1;
    }
    if ((pisr & NEW_FAR_MASK) == NEW_FAR_MASK) {
      // Read FAR here
      CURRENT_FAR = readFar();
      checkFrameAna(CURRENT_FAR);
    }
    // Clear interrupts with dummy read. Volatile is important here to prevent
    // compiler optimizations in release builds!
    volatile uint32_t dummy = *(registerBaseAddress + PDEC_PIR_OFFSET);
    static_cast<void>(dummy);
  }

  // The periodic checks and the IRQ limiter are evaluated for every signalled interrupt, even if
  // the read above failed or was short. Otherwise, repeatedly failing reads would bypass the
  // limiter and could still lock up the task.
  if (genericCheckCd.hasTimedOut()) {
    checkLocks();
    genericCheckCd.resetTimer();
    if (interruptWindowCd.hasTimedOut()) {
      if (interruptCounter >= MAX_ALLOWED_IRQS_PER_WINDOW) {
        sif::error << "PdecHandler::irqOperation: Possible IRQ storm" << std::endl;
        triggerEvent(TOO_MANY_IRQS, MAX_ALLOWED_IRQS_PER_WINDOW);
        resetIrqLimiters();
        TaskFactory::delayTask(400);
        return returnvalue::FAILED;
      }
      resetIrqLimiters();
    }
  }
  return returnvalue::OK;
}
void PdecHandler::readCommandQueue(void) { void PdecHandler::readCommandQueue(void) {
CommandMessage commandMessage; CommandMessage commandMessage;
ReturnValue_t result = returnvalue::FAILED; ReturnValue_t result = returnvalue::FAILED;
@ -618,6 +644,11 @@ void PdecHandler::printPdecMon() {
uint32_t PdecHandler::readFar() { return *(registerBaseAddress + PDEC_FAR_OFFSET); } uint32_t PdecHandler::readFar() { return *(registerBaseAddress + PDEC_FAR_OFFSET); }
/**
 * @brief Starts a fresh IRQ rate limiting window.
 *
 * Clears the count of handled interrupts and restarts the countdown which measures the window.
 */
void PdecHandler::resetIrqLimiters() {
  interruptCounter = 0;
  interruptWindowCd.resetTimer();
}
std::string PdecHandler::getMonStatusString(uint32_t status) { std::string PdecHandler::getMonStatusString(uint32_t status) {
switch (status) { switch (status) {
case TC_CHANNEL_INACTIVE: case TC_CHANNEL_INACTIVE:

View File

@ -87,6 +87,8 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc
static const Event LOST_CARRIER_LOCK_PDEC = MAKE_EVENT(5, severity::INFO); static const Event LOST_CARRIER_LOCK_PDEC = MAKE_EVENT(5, severity::INFO);
//! [EXPORT] : [COMMENT] Lost bit lock //! [EXPORT] : [COMMENT] Lost bit lock
static const Event LOST_BIT_LOCK_PDEC = MAKE_EVENT(6, severity::INFO); static const Event LOST_BIT_LOCK_PDEC = MAKE_EVENT(6, severity::INFO);
// NOTE(review): TOO_MANY_IRQS below uses unique ID 7, and so does POLL_SYSCALL_ERROR_PDEC, which
// is declared right after it. Assuming MAKE_EVENT builds on the same SUBSYSTEM_ID as
// event::makeEvent, both events would resolve to the same event ID - confirm and assign a free
// ID to one of them.
//! [EXPORT] : [COMMENT] Too many IRQs over the time window of one second. P1: Allowed TCs
static constexpr Event TOO_MANY_IRQS = MAKE_EVENT(7, severity::MEDIUM);
static constexpr Event POLL_SYSCALL_ERROR_PDEC = static constexpr Event POLL_SYSCALL_ERROR_PDEC =
event::makeEvent(SUBSYSTEM_ID, 7, severity::MEDIUM); event::makeEvent(SUBSYSTEM_ID, 7, severity::MEDIUM);
static constexpr Event WRITE_SYSCALL_ERROR_PDEC = static constexpr Event WRITE_SYSCALL_ERROR_PDEC =
@ -180,6 +182,8 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc
// discarded // discarded
static const uint8_t MAP_CLK_FREQ = 2; static const uint8_t MAP_CLK_FREQ = 2;
static constexpr uint32_t MAX_ALLOWED_IRQS_PER_WINDOW = 800;
enum class FrameAna_t : uint8_t { enum class FrameAna_t : uint8_t {
ABANDONED_CLTU, ABANDONED_CLTU,
FRAME_DIRTY, FRAME_DIRTY,
@ -206,13 +210,16 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc
static uint32_t CURRENT_FAR; static uint32_t CURRENT_FAR;
Countdown lockCheckCd = Countdown(IRQ_TIMEOUT_MS); Countdown genericCheckCd = Countdown(IRQ_TIMEOUT_MS);
object_id_t tcDestinationId; object_id_t tcDestinationId;
AcceptsTelecommandsIF* tcDestination = nullptr; AcceptsTelecommandsIF* tcDestination = nullptr;
LinuxLibgpioIF* gpioComIF = nullptr; LinuxLibgpioIF* gpioComIF = nullptr;
uint32_t interruptCounter = 0;
Countdown interruptWindowCd = Countdown(1000);
/** /**
* Reset signal is required to hold PDEC in reset state until the configuration has been * Reset signal is required to hold PDEC in reset state until the configuration has been
* written to the appropriate memory space. * written to the appropriate memory space.
@ -259,6 +266,7 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc
ReturnValue_t polledOperation(); ReturnValue_t polledOperation();
ReturnValue_t irqOperation(); ReturnValue_t irqOperation();
ReturnValue_t checkAndHandleIrqs(int fd, uint32_t& info);
uint32_t readFar(); uint32_t readFar();
@ -294,6 +302,8 @@ class PdecHandler : public SystemObject, public ExecutableObjectIF, public HasAc
*/ */
void checkLocks(); void checkLocks();
void resetIrqLimiters();
/** /**
* @brief Analyzes the FramAna field (frame analysis data) of a FAR report. * @brief Analyzes the FramAna field (frame analysis data) of a FAR report.
* *