Relax SUS FDIR #677

Merged
muellerr merged 17 commits from relax-sus-fdir into main 2023-09-12 10:16:07 +02:00
13 changed files with 68 additions and 39 deletions
Showing only changes of commit 58961efb3f - Show all commits

View File

@ -1,7 +1,7 @@
/** /**
* @brief Auto-generated event translation file. Contains 305 translations. * @brief Auto-generated event translation file. Contains 306 translations.
* @details * @details
* Generated on: 2023-09-07 15:38:20 * Generated on: 2023-09-07 16:02:29
*/ */
#include "translateEvents.h" #include "translateEvents.h"
@ -309,7 +309,8 @@ const char *DUMP_NOK_CANCELLED_STRING = "DUMP_NOK_CANCELLED";
const char *DUMP_MISC_CANCELLED_STRING = "DUMP_MISC_CANCELLED"; const char *DUMP_MISC_CANCELLED_STRING = "DUMP_MISC_CANCELLED";
const char *DUMP_HK_CANCELLED_STRING = "DUMP_HK_CANCELLED"; const char *DUMP_HK_CANCELLED_STRING = "DUMP_HK_CANCELLED";
const char *DUMP_CFDP_CANCELLED_STRING = "DUMP_CFDP_CANCELLED"; const char *DUMP_CFDP_CANCELLED_STRING = "DUMP_CFDP_CANCELLED";
const char *TEMPERATURE_IS_ALL_ONES_STRING = "TEMPERATURE_IS_ALL_ONES"; const char *TEMPERATURE_ALL_ONES_START_STRING = "TEMPERATURE_ALL_ONES_START";
const char *TEMPERATURE_ALL_ONES_RECOVERY_STRING = "TEMPERATURE_ALL_ONES_RECOVERY";
const char *translateEvents(Event event) { const char *translateEvents(Event event) {
switch ((event & 0xFFFF)) { switch ((event & 0xFFFF)) {
@ -922,7 +923,9 @@ const char *translateEvents(Event event) {
case (14314): case (14314):
return DUMP_CFDP_CANCELLED_STRING; return DUMP_CFDP_CANCELLED_STRING;
case (14500): case (14500):
return TEMPERATURE_IS_ALL_ONES_STRING; return TEMPERATURE_ALL_ONES_START_STRING;
case (14501):
return TEMPERATURE_ALL_ONES_RECOVERY_STRING;
default: default:
return "UNKNOWN_EVENT"; return "UNKNOWN_EVENT";
} }

View File

@ -2,7 +2,7 @@
* @brief Auto-generated object translation file. * @brief Auto-generated object translation file.
* @details * @details
* Contains 171 translations. * Contains 171 translations.
* Generated on: 2023-09-07 15:38:20 * Generated on: 2023-09-07 16:02:29
*/ */
#include "translateObjects.h" #include "translateObjects.h"

View File

@ -303,4 +303,5 @@ Event ID (dec); Event ID (hex); Name; Severity; Description; File Path
14312;0x37e8;DUMP_MISC_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h 14312;0x37e8;DUMP_MISC_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14313;0x37e9;DUMP_HK_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h 14313;0x37e9;DUMP_HK_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14314;0x37ea;DUMP_CFDP_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h 14314;0x37ea;DUMP_CFDP_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14500;0x38a4;TEMPERATURE_IS_ALL_ONES;LOW;Possible indicator that the SUS device is glitchy;mission/acs/SusHandler.h 14500;0x38a4;TEMPERATURE_ALL_ONES_START;LOW;Detected invalid values, starting invalid message counting;mission/acs/SusHandler.h
14501;0x38a5;TEMPERATURE_ALL_ONES_RECOVERY;LOW;Detected valid values again, resetting invalid message counter. P1: Invalid message counter.;mission/acs/SusHandler.h

1 Event ID (dec) Event ID (hex) Name Severity Description File Path
303 14312 0x37e8 DUMP_MISC_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
304 14313 0x37e9 DUMP_HK_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
305 14314 0x37ea DUMP_CFDP_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
306 14500 0x38a4 TEMPERATURE_ALL_ONES_START LOW Detected invalid values, starting invalid message counting mission/acs/SusHandler.h
307 14501 0x38a5 TEMPERATURE_ALL_ONES_RECOVERY LOW Detected valid values again, resetting invalid message counter. P1: Invalid message counter. mission/acs/SusHandler.h

View File

@ -303,4 +303,5 @@ Event ID (dec); Event ID (hex); Name; Severity; Description; File Path
14312;0x37e8;DUMP_MISC_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h 14312;0x37e8;DUMP_MISC_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14313;0x37e9;DUMP_HK_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h 14313;0x37e9;DUMP_HK_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14314;0x37ea;DUMP_CFDP_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h 14314;0x37ea;DUMP_CFDP_CANCELLED;LOW;P1: Number of dumped packets. P2: Total dumped bytes.;mission/persistentTmStoreDefs.h
14500;0x38a4;TEMPERATURE_IS_ALL_ONES;LOW;Possible indicator that the SUS device is glitchy;mission/acs/SusHandler.h 14500;0x38a4;TEMPERATURE_ALL_ONES_START;LOW;Detected invalid values, starting invalid message counting;mission/acs/SusHandler.h
14501;0x38a5;TEMPERATURE_ALL_ONES_RECOVERY;LOW;Detected valid values again, resetting invalid message counter. P1: Invalid message counter.;mission/acs/SusHandler.h

1 Event ID (dec) Event ID (hex) Name Severity Description File Path
303 14312 0x37e8 DUMP_MISC_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
304 14313 0x37e9 DUMP_HK_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
305 14314 0x37ea DUMP_CFDP_CANCELLED LOW P1: Number of dumped packets. P2: Total dumped bytes. mission/persistentTmStoreDefs.h
306 14500 0x38a4 TEMPERATURE_ALL_ONES_START LOW Detected invalid values, starting invalid message counting mission/acs/SusHandler.h
307 14501 0x38a5 TEMPERATURE_ALL_ONES_RECOVERY LOW Detected valid values again, resetting invalid message counter. P1: Invalid message counter. mission/acs/SusHandler.h

View File

@ -1,7 +1,7 @@
/** /**
* @brief Auto-generated event translation file. Contains 305 translations. * @brief Auto-generated event translation file. Contains 306 translations.
* @details * @details
* Generated on: 2023-09-07 15:38:20 * Generated on: 2023-09-07 16:02:29
*/ */
#include "translateEvents.h" #include "translateEvents.h"
@ -309,7 +309,8 @@ const char *DUMP_NOK_CANCELLED_STRING = "DUMP_NOK_CANCELLED";
const char *DUMP_MISC_CANCELLED_STRING = "DUMP_MISC_CANCELLED"; const char *DUMP_MISC_CANCELLED_STRING = "DUMP_MISC_CANCELLED";
const char *DUMP_HK_CANCELLED_STRING = "DUMP_HK_CANCELLED"; const char *DUMP_HK_CANCELLED_STRING = "DUMP_HK_CANCELLED";
const char *DUMP_CFDP_CANCELLED_STRING = "DUMP_CFDP_CANCELLED"; const char *DUMP_CFDP_CANCELLED_STRING = "DUMP_CFDP_CANCELLED";
const char *TEMPERATURE_IS_ALL_ONES_STRING = "TEMPERATURE_IS_ALL_ONES"; const char *TEMPERATURE_ALL_ONES_START_STRING = "TEMPERATURE_ALL_ONES_START";
const char *TEMPERATURE_ALL_ONES_RECOVERY_STRING = "TEMPERATURE_ALL_ONES_RECOVERY";
const char *translateEvents(Event event) { const char *translateEvents(Event event) {
switch ((event & 0xFFFF)) { switch ((event & 0xFFFF)) {
@ -922,7 +923,9 @@ const char *translateEvents(Event event) {
case (14314): case (14314):
return DUMP_CFDP_CANCELLED_STRING; return DUMP_CFDP_CANCELLED_STRING;
case (14500): case (14500):
return TEMPERATURE_IS_ALL_ONES_STRING; return TEMPERATURE_ALL_ONES_START_STRING;
case (14501):
return TEMPERATURE_ALL_ONES_RECOVERY_STRING;
default: default:
return "UNKNOWN_EVENT"; return "UNKNOWN_EVENT";
} }

View File

@ -2,7 +2,7 @@
* @brief Auto-generated object translation file. * @brief Auto-generated object translation file.
* @details * @details
* Contains 175 translations. * Contains 175 translations.
* Generated on: 2023-09-07 15:38:20 * Generated on: 2023-09-07 16:02:29
*/ */
#include "translateObjects.h" #include "translateObjects.h"

View File

@ -1,7 +1,7 @@
/** /**
* @brief Auto-generated event translation file. Contains 305 translations. * @brief Auto-generated event translation file. Contains 306 translations.
* @details * @details
* Generated on: 2023-09-07 15:38:20 * Generated on: 2023-09-07 16:02:29
*/ */
#include "translateEvents.h" #include "translateEvents.h"
@ -309,7 +309,8 @@ const char *DUMP_NOK_CANCELLED_STRING = "DUMP_NOK_CANCELLED";
const char *DUMP_MISC_CANCELLED_STRING = "DUMP_MISC_CANCELLED"; const char *DUMP_MISC_CANCELLED_STRING = "DUMP_MISC_CANCELLED";
const char *DUMP_HK_CANCELLED_STRING = "DUMP_HK_CANCELLED"; const char *DUMP_HK_CANCELLED_STRING = "DUMP_HK_CANCELLED";
const char *DUMP_CFDP_CANCELLED_STRING = "DUMP_CFDP_CANCELLED"; const char *DUMP_CFDP_CANCELLED_STRING = "DUMP_CFDP_CANCELLED";
const char *TEMPERATURE_IS_ALL_ONES_STRING = "TEMPERATURE_IS_ALL_ONES"; const char *TEMPERATURE_ALL_ONES_START_STRING = "TEMPERATURE_ALL_ONES_START";
const char *TEMPERATURE_ALL_ONES_RECOVERY_STRING = "TEMPERATURE_ALL_ONES_RECOVERY";
const char *translateEvents(Event event) { const char *translateEvents(Event event) {
switch ((event & 0xFFFF)) { switch ((event & 0xFFFF)) {
@ -922,7 +923,9 @@ const char *translateEvents(Event event) {
case (14314): case (14314):
return DUMP_CFDP_CANCELLED_STRING; return DUMP_CFDP_CANCELLED_STRING;
case (14500): case (14500):
return TEMPERATURE_IS_ALL_ONES_STRING; return TEMPERATURE_ALL_ONES_START_STRING;
case (14501):
return TEMPERATURE_ALL_ONES_RECOVERY_STRING;
default: default:
return "UNKNOWN_EVENT"; return "UNKNOWN_EVENT";
} }

View File

@ -2,7 +2,7 @@
* @brief Auto-generated object translation file. * @brief Auto-generated object translation file.
* @details * @details
* Contains 175 translations. * Contains 175 translations.
* Generated on: 2023-09-07 15:38:20 * Generated on: 2023-09-07 16:02:29
*/ */
#include "translateObjects.h" #include "translateObjects.h"

View File

@ -20,6 +20,9 @@ void SusHandler::doStartUp() {
} }
if (internalState == InternalState::STARTUP) { if (internalState == InternalState::STARTUP) {
if (commandExecuted) { if (commandExecuted) {
if (waitingForRecovery) {
waitingForRecovery = false;
}
setMode(MODE_ON); setMode(MODE_ON);
internalState = InternalState::NONE; internalState = InternalState::NONE;
commandExecuted = false; commandExecuted = false;
@ -88,26 +91,26 @@ ReturnValue_t SusHandler::interpretDeviceReply(DeviceCommandId_t id, const uint8
commandExecuted = true; commandExecuted = true;
} }
PoolReadGuard pg(&dataset); PoolReadGuard pg(&dataset);
// In a previous stricter FDIR variant, this was considered faulty communication and was already // Simple FDIR variant to make the handler more robust to invalid messages which
// handled in the communication interface. However, the SUS devices probably glitch in orbit, // appear sometimes for the SUS device: Allow invalid messages up to a certain threshold
// so the FDIR was relaxed. The fault case check previously used now only leads to the dataset // before triggering FDIR reactions.
// being marked invalid, shifting more responsibility of determining and setting SUS devices if (reply->tempRaw == 0xfff and not waitingForRecovery) {
// faulty to the operator. if (invalidMsgCounter == 0) {
triggerEvent(TEMPERATURE_ALL_ONES_START);
// UPDATE: Step1: First determine how often and whether this happens at all } else if (invalidMsgCounter == susMax1227::MAX_INVALID_MSG_COUNT) {
if (reply->tempRaw == 0xfff) { triggerEvent(DeviceHandlerIF::DEVICE_WANTS_HARD_REBOOT);
// Prevent spam if a device is glitchy for prolonged periods by only triggering with a waitingForRecovery = true;
// maximum interval. } else {
if (faultyDataEventCd.hasTimedOut()) { invalidMsgCounter++;
triggerEvent(TEMPERATURE_IS_ALL_ONES);
faultyDataEventCd.resetTimer();
} }
// dataset.setValidity(false, true); dataset.setValidity(false, true);
// return returnvalue::OK; dataset.tempC = thermal::INVALID_TEMPERATURE;
std::memset(dataset.channels.value, 0, sizeof(dataset.channels.value));
} else {
dataset.setValidity(true, true);
dataset.tempC = max1227::getTemperature(reply->tempRaw);
std::memcpy(dataset.channels.value, reply->channelsRaw, sizeof(reply->channelsRaw));
} }
dataset.setValidity(true, true);
dataset.tempC = max1227::getTemperature(reply->tempRaw);
std::memcpy(dataset.channels.value, reply->channelsRaw, sizeof(reply->channelsRaw));
} }
Review

maybe decrement here instead of resetting the counter?

maybe decrement here instead of resetting the counter?
Review

Then I'd change the event handling, not sure what would be best here.. Otherwise it could be a lot of events.

Then I'd change the event handling, not sure what would be best here.. Otherwise it could be a lot of events.
Review

not sure either. your call

not sure either. your call
return returnvalue::OK; return returnvalue::OK;
} }

View File

@ -17,8 +17,13 @@ class SusHandler : public DeviceHandlerBase {
static const uint8_t INTERFACE_ID = CLASS_ID::SUS_HANDLER; static const uint8_t INTERFACE_ID = CLASS_ID::SUS_HANDLER;
static const uint8_t SUBSYSTEM_ID = SUBSYSTEM_ID::SUS_HANDLER; static const uint8_t SUBSYSTEM_ID = SUBSYSTEM_ID::SUS_HANDLER;
//! [EXPORT] : [COMMENT] Possible indicator that the SUS device is glitchy //! [EXPORT] : [COMMENT] Detected invalid values, starting invalid message counting
static constexpr Event TEMPERATURE_IS_ALL_ONES = event::makeEvent(SUBSYSTEM_ID, 0, severity::LOW); static constexpr Event TEMPERATURE_ALL_ONES_START =
event::makeEvent(SUBSYSTEM_ID, 0, severity::LOW);
//! [EXPORT] : [COMMENT] Detected valid values again, resetting invalid message counter.
//! P1: Invalid message counter.
static constexpr Event TEMPERATURE_ALL_ONES_RECOVERY =
event::makeEvent(SUBSYSTEM_ID, 1, severity::LOW);
SusHandler(uint32_t objectId, uint8_t susIdx, object_id_t deviceCommunication, SusHandler(uint32_t objectId, uint8_t susIdx, object_id_t deviceCommunication,
CookieIF *comCookie); CookieIF *comCookie);
@ -46,10 +51,11 @@ class SusHandler : public DeviceHandlerBase {
LocalPoolDataSetBase *getDataSetHandle(sid_t sid) override; LocalPoolDataSetBase *getDataSetHandle(sid_t sid) override;
private: private:
Countdown faultyDataEventCd = Countdown(60000);
susMax1227::SusDataset dataset; susMax1227::SusDataset dataset;
acs::SusRequest request{}; acs::SusRequest request{};
uint8_t susIdx; uint8_t susIdx;
bool waitingForRecovery = true;
uint32_t invalidMsgCounter = 0;
uint32_t transitionDelay = 1000; uint32_t transitionDelay = 1000;
bool goToNormalMode = false; bool goToNormalMode = false;

View File

@ -8,6 +8,14 @@
namespace susMax1227 { namespace susMax1227 {
// This is 16 seconds for a polling period of 0.4 seconds.
static constexpr uint32_t MAX_INVALID_MSG_COUNT = 40;
// Using a decrement time of 32 seconds should cause faulty device incrementation to be faster than
// the decrementation, so that FDIR reactions will eventually be triggered.
// NOTE: Not used currently, we perform the strange reply check logic in the handler and trigger
// a reboot directly using the appropriate event.
static constexpr uint32_t FAULTY_COM_DECREMENT_TIME_MS = 32000;
static const DeviceCommandId_t NONE = 0x0; // Set when no command is pending static const DeviceCommandId_t NONE = 0x0; // Set when no command is pending
static const DeviceCommandId_t WRITE_SETUP = 1; static const DeviceCommandId_t WRITE_SETUP = 1;

View File

@ -1,6 +1,7 @@
#include "SusFdir.h" #include "SusFdir.h"
#include "eive/objects.h" #include "eive/objects.h"
#include "mission/acs/susMax1227Helpers.h"
SusFdir::SusFdir(object_id_t sensorId) SusFdir::SusFdir(object_id_t sensorId)
: DeviceHandlerFailureIsolation(sensorId, objects::SUS_BOARD_ASS) {} : DeviceHandlerFailureIsolation(sensorId, objects::SUS_BOARD_ASS) {}

2
tmtc

@ -1 +1 @@
Subproject commit 957d756d1e0c9862ae18798b4c27f8f0da8a349b Subproject commit d285b1caeca01709004983a4d2e795898325effb