Relax SUS FDIR #677

Merged
muellerr merged 17 commits from relax-sus-fdir into main 2023-09-12 10:16:07 +02:00
13 changed files with 68 additions and 39 deletions
Showing only changes of commit 58961efb3f - Show all commits

View File

@ -1,7 +1,7 @@
/**
* @brief Auto-generated event translation file. Contains 305 translations.
* @brief Auto-generated event translation file. Contains 306 translations.
* @details
* Generated on: 2023-09-07 15:38:20
* Generated on: 2023-09-07 16:02:29
*/
#include "translateEvents.h"
@ -309,7 +309,8 @@ const char *DUMP_NOK_CANCELLED_STRING = "DUMP_NOK_CANCELLED";
const char *DUMP_MISC_CANCELLED_STRING = "DUMP_MISC_CANCELLED";
const char *DUMP_HK_CANCELLED_STRING = "DUMP_HK_CANCELLED";
const char *DUMP_CFDP_CANCELLED_STRING = "DUMP_CFDP_CANCELLED";
const char *TEMPERATURE_IS_ALL_ONES_STRING = "TEMPERATURE_IS_ALL_ONES";
const char *TEMPERATURE_ALL_ONES_START_STRING = "TEMPERATURE_ALL_ONES_START";
const char *TEMPERATURE_ALL_ONES_RECOVERY_STRING = "TEMPERATURE_ALL_ONES_RECOVERY";
const char *translateEvents(Event event) {
switch ((event & 0xFFFF)) {
@ -922,7 +923,9 @@ const char *translateEvents(Event event) {
case (14314):
return DUMP_CFDP_CANCELLED_STRING;
case (14500):
return TEMPERATURE_IS_ALL_ONES_STRING;
return TEMPERATURE_ALL_ONES_START_STRING;
case (14501):
return TEMPERATURE_ALL_ONES_RECOVERY_STRING;
default:
return "UNKNOWN_EVENT";
}

View File

@ -2,7 +2,7 @@
* @brief Auto-generated object translation file.
* @details
* Contains 171 translations.
* Generated on: 2023-09-07 15:38:20
* Generated on: 2023-09-07 16:02:29
*/
#include "translateObjects.h"

View File

@ -303,4 +303,5 @@ Event ID (dec); Event ID (hex); Name; Severity; Description; File Path
14312;0x37e8;DUMP_MISC_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14313;0x37e9;DUMP_HK_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14314;0x37ea;DUMP_CFDP_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14500;0x38a4;TEMPERATURE_IS_ALL_ONES;LOW;Possible indicator that the SUS device is glitchy;mission/acs/SusHandler.h
14500;0x38a4;TEMPERATURE_ALL_ONES_START;LOW;Detected invalid values, starting invalid message counting;mission/acs/SusHandler.h
14501;0x38a5;TEMPERATURE_ALL_ONES_RECOVERY;LOW;Detected valid values again, resetting invalid message counter. P1: Invalid message counter.;mission/acs/SusHandler.h

1 Event ID (dec) Event ID (hex) Name Severity Description File Path
303 14312 0x37e8 DUMP_MISC_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
304 14313 0x37e9 DUMP_HK_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
305 14314 0x37ea DUMP_CFDP_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
306 14500 0x38a4 TEMPERATURE_ALL_ONES_START LOW Detected invalid values, starting invalid message counting mission/acs/SusHandler.h
307 14501 0x38a5 TEMPERATURE_ALL_ONES_RECOVERY LOW Detected valid values again, resetting invalid message counter. P1: Invalid message counter. mission/acs/SusHandler.h

View File

@ -303,4 +303,5 @@ Event ID (dec); Event ID (hex); Name; Severity; Description; File Path
14312;0x37e8;DUMP_MISC_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14313;0x37e9;DUMP_HK_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14314;0x37ea;DUMP_CFDP_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14500;0x38a4;TEMPERATURE_IS_ALL_ONES;LOW;Possible indicator that the SUS device is glitchy;mission/acs/SusHandler.h
14500;0x38a4;TEMPERATURE_ALL_ONES_START;LOW;Detected invalid values, starting invalid message counting;mission/acs/SusHandler.h
14501;0x38a5;TEMPERATURE_ALL_ONES_RECOVERY;LOW;Detected valid values again, resetting invalid message counter. P1: Invalid message counter.;mission/acs/SusHandler.h

1 Event ID (dec) Event ID (hex) Name Severity Description File Path
303 14312 0x37e8 DUMP_MISC_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
304 14313 0x37e9 DUMP_HK_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
305 14314 0x37ea DUMP_CFDP_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
306 14500 0x38a4 TEMPERATURE_ALL_ONES_START LOW Detected invalid values, starting invalid message counting mission/acs/SusHandler.h
307 14501 0x38a5 TEMPERATURE_ALL_ONES_RECOVERY LOW Detected valid values again, resetting invalid message counter. P1: Invalid message counter. mission/acs/SusHandler.h

View File

@ -1,7 +1,7 @@
/**
* @brief Auto-generated event translation file. Contains 305 translations.
* @brief Auto-generated event translation file. Contains 306 translations.
* @details
* Generated on: 2023-09-07 15:38:20
* Generated on: 2023-09-07 16:02:29
*/
#include "translateEvents.h"
@ -309,7 +309,8 @@ const char *DUMP_NOK_CANCELLED_STRING = "DUMP_NOK_CANCELLED";
const char *DUMP_MISC_CANCELLED_STRING = "DUMP_MISC_CANCELLED";
const char *DUMP_HK_CANCELLED_STRING = "DUMP_HK_CANCELLED";
const char *DUMP_CFDP_CANCELLED_STRING = "DUMP_CFDP_CANCELLED";
const char *TEMPERATURE_IS_ALL_ONES_STRING = "TEMPERATURE_IS_ALL_ONES";
const char *TEMPERATURE_ALL_ONES_START_STRING = "TEMPERATURE_ALL_ONES_START";
const char *TEMPERATURE_ALL_ONES_RECOVERY_STRING = "TEMPERATURE_ALL_ONES_RECOVERY";
const char *translateEvents(Event event) {
switch ((event & 0xFFFF)) {
@ -922,7 +923,9 @@ const char *translateEvents(Event event) {
case (14314):
return DUMP_CFDP_CANCELLED_STRING;
case (14500):
return TEMPERATURE_IS_ALL_ONES_STRING;
return TEMPERATURE_ALL_ONES_START_STRING;
case (14501):
return TEMPERATURE_ALL_ONES_RECOVERY_STRING;
default:
return "UNKNOWN_EVENT";
}

View File

@ -2,7 +2,7 @@
* @brief Auto-generated object translation file.
* @details
* Contains 175 translations.
* Generated on: 2023-09-07 15:38:20
* Generated on: 2023-09-07 16:02:29
*/
#include "translateObjects.h"

View File

@ -1,7 +1,7 @@
/**
* @brief Auto-generated event translation file. Contains 305 translations.
* @brief Auto-generated event translation file. Contains 306 translations.
* @details
* Generated on: 2023-09-07 15:38:20
* Generated on: 2023-09-07 16:02:29
*/
#include "translateEvents.h"
@ -309,7 +309,8 @@ const char *DUMP_NOK_CANCELLED_STRING = "DUMP_NOK_CANCELLED";
const char *DUMP_MISC_CANCELLED_STRING = "DUMP_MISC_CANCELLED";
const char *DUMP_HK_CANCELLED_STRING = "DUMP_HK_CANCELLED";
const char *DUMP_CFDP_CANCELLED_STRING = "DUMP_CFDP_CANCELLED";
const char *TEMPERATURE_IS_ALL_ONES_STRING = "TEMPERATURE_IS_ALL_ONES";
const char *TEMPERATURE_ALL_ONES_START_STRING = "TEMPERATURE_ALL_ONES_START";
const char *TEMPERATURE_ALL_ONES_RECOVERY_STRING = "TEMPERATURE_ALL_ONES_RECOVERY";
const char *translateEvents(Event event) {
switch ((event & 0xFFFF)) {
@ -922,7 +923,9 @@ const char *translateEvents(Event event) {
case (14314):
return DUMP_CFDP_CANCELLED_STRING;
case (14500):
return TEMPERATURE_IS_ALL_ONES_STRING;
return TEMPERATURE_ALL_ONES_START_STRING;
case (14501):
return TEMPERATURE_ALL_ONES_RECOVERY_STRING;
default:
return "UNKNOWN_EVENT";
}

View File

@ -2,7 +2,7 @@
* @brief Auto-generated object translation file.
* @details
* Contains 175 translations.
* Generated on: 2023-09-07 15:38:20
* Generated on: 2023-09-07 16:02:29
*/
#include "translateObjects.h"

View File

@ -20,6 +20,9 @@ void SusHandler::doStartUp() {
}
if (internalState == InternalState::STARTUP) {
if (commandExecuted) {
if (waitingForRecovery) {
waitingForRecovery = false;
}
setMode(MODE_ON);
internalState = InternalState::NONE;
commandExecuted = false;
@ -88,26 +91,26 @@ ReturnValue_t SusHandler::interpretDeviceReply(DeviceCommandId_t id, const uint8
commandExecuted = true;
}
PoolReadGuard pg(&dataset);
// In a previous stricter FDIR variant, this was considered faulty communication and was already
// handled in the communication interface. However, the SUS devices probably glitch in orbit,
// so the FDIR was relaxed. The fault case check previously used now only leads to the dataset
// being marked invalid, shifting more responsibility of determining and setting SUS devices
// faulty to the operator.
// UPDATE: Step1: First determine how often and whether this happens at all
if (reply->tempRaw == 0xfff) {
// Prevent spam if a device is glitchy for prolonged periods by only triggering with a
// maximum interval.
if (faultyDataEventCd.hasTimedOut()) {
triggerEvent(TEMPERATURE_IS_ALL_ONES);
faultyDataEventCd.resetTimer();
// Simple FDIR variant to make the handler more robust to invalid messages which
// appear sometimes for the SUS device: Allow invalid message up to a certain threshold
// before triggering FDIR reactions.
if (reply->tempRaw == 0xfff and not waitingForRecovery) {
if (invalidMsgCounter == 0) {
triggerEvent(TEMPERATURE_ALL_ONES_START);
} else if (invalidMsgCounter == susMax1227::MAX_INVALID_MSG_COUNT) {
triggerEvent(DeviceHandlerIF::DEVICE_WANTS_HARD_REBOOT);
waitingForRecovery = true;
} else {
invalidMsgCounter++;
}
// dataset.setValidity(false, true);
// return returnvalue::OK;
dataset.setValidity(false, true);
dataset.tempC = thermal::INVALID_TEMPERATURE;
std::memset(dataset.channels.value, 0, sizeof(dataset.channels.value));
} else {
dataset.setValidity(true, true);
dataset.tempC = max1227::getTemperature(reply->tempRaw);
std::memcpy(dataset.channels.value, reply->channelsRaw, sizeof(reply->channelsRaw));
}
dataset.setValidity(true, true);
dataset.tempC = max1227::getTemperature(reply->tempRaw);
std::memcpy(dataset.channels.value, reply->channelsRaw, sizeof(reply->channelsRaw));
}
Review

maybe decrement here instead of resetting the counter?

maybe decrement here instead of resetting the counter?
Review

Then I'd change the event handling, not sure what would be best here.. Otherwise it could be a lot of events.

Then I'd change the event handling, not sure what would be best here.. Otherwise it could be a lot of events.
Review

not sure either. your call

not sure either. your call
return returnvalue::OK;
}

View File

@ -17,8 +17,13 @@ class SusHandler : public DeviceHandlerBase {
static const uint8_t INTERFACE_ID = CLASS_ID::SUS_HANDLER;
static const uint8_t SUBSYSTEM_ID = SUBSYSTEM_ID::SUS_HANDLER;
//! [EXPORT] : [COMMENT] Possible indicator that the SUS device is glitchy
static constexpr Event TEMPERATURE_IS_ALL_ONES = event::makeEvent(SUBSYSTEM_ID, 0, severity::LOW);
//! [EXPORT] : [COMMENT] Detected invalid values, starting invalid message counting
static constexpr Event TEMPERATURE_ALL_ONES_START =
event::makeEvent(SUBSYSTEM_ID, 0, severity::LOW);
muellerr marked this conversation as resolved Outdated

prob medium severity

prob medium severity
//! [EXPORT] : [COMMENT] Detected valid values again, resetting invalid message counter.
//! P1: Invalid message counter.
static constexpr Event TEMPERATURE_ALL_ONES_RECOVERY =
event::makeEvent(SUBSYSTEM_ID, 1, severity::LOW);
muellerr marked this conversation as resolved Outdated

prob info event

prob info event
SusHandler(uint32_t objectId, uint8_t susIdx, object_id_t deviceCommunication,
CookieIF *comCookie);
@ -46,10 +51,11 @@ class SusHandler : public DeviceHandlerBase {
LocalPoolDataSetBase *getDataSetHandle(sid_t sid) override;
private:
Countdown faultyDataEventCd = Countdown(60000);
susMax1227::SusDataset dataset;
acs::SusRequest request{};
uint8_t susIdx;
bool waitingForRecovery = true;
uint32_t invalidMsgCounter = 0;
uint32_t transitionDelay = 1000;
bool goToNormalMode = false;

View File

@ -8,6 +8,14 @@
namespace susMax1227 {
// This is 16 seconds for a polling frequency of 0.4 seconds.
static constexpr uint32_t MAX_INVALID_MSG_COUNT = 40;
// Using a decrement time of 32 seconds should cause faulty device incrementation to best faster
// the decrementation, so that FDIR reactions will eventuall be triggered.
// NOTE: Not used currently, we perform the strange reply check logic in the handler and trigger
// a reboot directly using the appropriate event.
static constexpr uint32_t FAULTY_COM_DECREMENT_TIME_MS = 32000;
static const DeviceCommandId_t NONE = 0x0; // Set when no command is pending
static const DeviceCommandId_t WRITE_SETUP = 1;

View File

@ -1,6 +1,7 @@
#include "SusFdir.h"
#include "eive/objects.h"
#include "mission/acs/susMax1227Helpers.h"
SusFdir::SusFdir(object_id_t sensorId)
: DeviceHandlerFailureIsolation(sensorId, objects::SUS_BOARD_ASS) {}

2
tmtc

@ -1 +1 @@
Subproject commit 957d756d1e0c9862ae18798b4c27f8f0da8a349b
Subproject commit d285b1caeca01709004983a4d2e795898325effb