Took over DHB failure isolation base fixes

This commit is contained in:
Robin Müller 2020-08-27 20:00:36 +02:00
parent 4d1b1ba506
commit a6e99e443a
2 changed files with 304 additions and 305 deletions

View File

@ -1,251 +1,251 @@
#include "../devicehandlers/DeviceHandlerBase.h" #include "DeviceHandlerBase.h"
#include "../devicehandlers/DeviceHandlerFailureIsolation.h" #include "DeviceHandlerFailureIsolation.h"
#include "../health/HealthTableIF.h" #include "../health/HealthTableIF.h"
#include "../power/Fuse.h" #include "../power/Fuse.h"
#include "../serviceinterface/ServiceInterfaceStream.h" #include "../serviceinterface/ServiceInterfaceStream.h"
#include "../thermal/ThermalComponentIF.h" #include "../thermal/ThermalComponentIF.h"
object_id_t DeviceHandlerFailureIsolation::powerConfirmationId = 0; object_id_t DeviceHandlerFailureIsolation::powerConfirmationId =
objects::NO_OBJECT;
DeviceHandlerFailureIsolation::DeviceHandlerFailureIsolation(object_id_t owner,
object_id_t parent) : DeviceHandlerFailureIsolation::DeviceHandlerFailureIsolation(object_id_t owner,
FailureIsolationBase(owner, parent), object_id_t parent) :
strangeReplyCount(MAX_STRANGE_REPLIES, STRANGE_REPLIES_TIME_MS, FailureIsolationBase(owner, parent),
parameterDomainBase++), strangeReplyCount(MAX_STRANGE_REPLIES, STRANGE_REPLIES_TIME_MS,
missedReplyCount( MAX_MISSED_REPLY_COUNT, MISSED_REPLY_TIME_MS, parameterDomainBase++),
parameterDomainBase++), missedReplyCount( MAX_MISSED_REPLY_COUNT, MISSED_REPLY_TIME_MS,
recoveryCounter(MAX_REBOOT, REBOOT_TIME_MS, parameterDomainBase++), parameterDomainBase++),
fdirState(NONE), powerConfirmation(0) { recoveryCounter(MAX_REBOOT, REBOOT_TIME_MS, parameterDomainBase++),
} fdirState(NONE), powerConfirmation(0) {
}
DeviceHandlerFailureIsolation::~DeviceHandlerFailureIsolation() {
} DeviceHandlerFailureIsolation::~DeviceHandlerFailureIsolation() {
}
ReturnValue_t DeviceHandlerFailureIsolation::eventReceived(EventMessage* event) {
if(isFdirInActionOrAreWeFaulty(event)) { ReturnValue_t DeviceHandlerFailureIsolation::eventReceived(EventMessage* event) {
return RETURN_OK; if(isFdirInActionOrAreWeFaulty(event)) {
} return RETURN_OK;
ReturnValue_t result = RETURN_FAILED; }
switch (event->getEvent()) { ReturnValue_t result = RETURN_FAILED;
case HasModesIF::MODE_TRANSITION_FAILED: switch (event->getEvent()) {
case HasModesIF::OBJECT_IN_INVALID_MODE: case HasModesIF::MODE_TRANSITION_FAILED:
//We'll try a recovery as long as defined in MAX_REBOOT. case HasModesIF::OBJECT_IN_INVALID_MODE:
//Might cause some AssemblyBase cycles, so keep number low. //We'll try a recovery as long as defined in MAX_REBOOT.
handleRecovery(event->getEvent()); //Might cause some AssemblyBase cycles, so keep number low.
break; handleRecovery(event->getEvent());
case DeviceHandlerIF::DEVICE_INTERPRETING_REPLY_FAILED: break;
case DeviceHandlerIF::DEVICE_READING_REPLY_FAILED: case DeviceHandlerIF::DEVICE_INTERPRETING_REPLY_FAILED:
case DeviceHandlerIF::DEVICE_UNREQUESTED_REPLY: case DeviceHandlerIF::DEVICE_READING_REPLY_FAILED:
case DeviceHandlerIF::DEVICE_UNKNOWN_REPLY: //Some DH's generate generic reply-ids. case DeviceHandlerIF::DEVICE_UNREQUESTED_REPLY:
case DeviceHandlerIF::DEVICE_BUILDING_COMMAND_FAILED: case DeviceHandlerIF::DEVICE_UNKNOWN_REPLY: //Some DH's generate generic reply-ids.
//These faults all mean that there were stupid replies from a device. case DeviceHandlerIF::DEVICE_BUILDING_COMMAND_FAILED:
if (strangeReplyCount.incrementAndCheck()) { //These faults all mean that there were stupid replies from a device.
handleRecovery(event->getEvent()); if (strangeReplyCount.incrementAndCheck()) {
} handleRecovery(event->getEvent());
break; }
case DeviceHandlerIF::DEVICE_SENDING_COMMAND_FAILED: break;
case DeviceHandlerIF::DEVICE_REQUESTING_REPLY_FAILED: case DeviceHandlerIF::DEVICE_SENDING_COMMAND_FAILED:
//The two above should never be confirmed. case DeviceHandlerIF::DEVICE_REQUESTING_REPLY_FAILED:
case DeviceHandlerIF::DEVICE_MISSED_REPLY: //The two above should never be confirmed.
result = sendConfirmationRequest(event); case DeviceHandlerIF::DEVICE_MISSED_REPLY:
if (result == HasReturnvaluesIF::RETURN_OK) { result = sendConfirmationRequest(event);
break; if (result == HasReturnvaluesIF::RETURN_OK) {
} break;
//else }
if (missedReplyCount.incrementAndCheck()) { //else
handleRecovery(event->getEvent()); if (missedReplyCount.incrementAndCheck()) {
} handleRecovery(event->getEvent());
break; }
case StorageManagerIF::GET_DATA_FAILED: break;
case StorageManagerIF::STORE_DATA_FAILED: case StorageManagerIF::GET_DATA_FAILED:
//Rather strange bugs, occur in RAW mode only. Ignore. case StorageManagerIF::STORE_DATA_FAILED:
break; //Rather strange bugs, occur in RAW mode only. Ignore.
case DeviceHandlerIF::INVALID_DEVICE_COMMAND: break;
//Ignore, is bad configuration. We can't do anything in flight. case DeviceHandlerIF::INVALID_DEVICE_COMMAND:
break; //Ignore, is bad configuration. We can't do anything in flight.
case HasHealthIF::HEALTH_INFO: break;
case HasModesIF::MODE_INFO: case HasHealthIF::HEALTH_INFO:
case HasModesIF::CHANGING_MODE: case HasModesIF::MODE_INFO:
//Do nothing, but mark as handled. case HasModesIF::CHANGING_MODE:
break; //Do nothing, but mark as handled.
//****Power***** break;
case PowerSwitchIF::SWITCH_WENT_OFF: //****Power*****
if(hasPowerConfirmation) { case PowerSwitchIF::SWITCH_WENT_OFF:
result = sendConfirmationRequest(event, powerConfirmation); if(powerConfirmation != MessageQueueIF::NO_QUEUE) {
if (result == RETURN_OK) { result = sendConfirmationRequest(event, powerConfirmation);
setFdirState(DEVICE_MIGHT_BE_OFF); if (result == RETURN_OK) {
} setFdirState(DEVICE_MIGHT_BE_OFF);
} }
break; }
case Fuse::FUSE_WENT_OFF: break;
//Not so good, because PCDU reacted. case Fuse::FUSE_WENT_OFF:
case Fuse::POWER_ABOVE_HIGH_LIMIT: //Not so good, because PCDU reacted.
//Better, because software detected over-current. case Fuse::POWER_ABOVE_HIGH_LIMIT:
setFaulty(event->getEvent()); //Better, because software detected over-current.
break; setFaulty(event->getEvent());
case Fuse::POWER_BELOW_LOW_LIMIT: break;
//Device might got stuck during boot, retry. case Fuse::POWER_BELOW_LOW_LIMIT:
handleRecovery(event->getEvent()); //Device might got stuck during boot, retry.
break; handleRecovery(event->getEvent());
//****Thermal***** break;
case ThermalComponentIF::COMPONENT_TEMP_LOW: //****Thermal*****
case ThermalComponentIF::COMPONENT_TEMP_HIGH: case ThermalComponentIF::COMPONENT_TEMP_LOW:
case ThermalComponentIF::COMPONENT_TEMP_OOL_LOW: case ThermalComponentIF::COMPONENT_TEMP_HIGH:
case ThermalComponentIF::COMPONENT_TEMP_OOL_HIGH: case ThermalComponentIF::COMPONENT_TEMP_OOL_LOW:
//Well, the device is not really faulty, but it is required to stay off as long as possible. case ThermalComponentIF::COMPONENT_TEMP_OOL_HIGH:
setFaulty(event->getEvent()); //Well, the device is not really faulty, but it is required to stay off as long as possible.
break; setFaulty(event->getEvent());
case ThermalComponentIF::TEMP_NOT_IN_OP_RANGE: break;
//Ignore, is information only. case ThermalComponentIF::TEMP_NOT_IN_OP_RANGE:
break; //Ignore, is information only.
//*******Default monitoring variables. Are currently not used.***** break;
// case DeviceHandlerIF::MONITORING_LIMIT_EXCEEDED: //*******Default monitoring variables. Are currently not used.*****
// setFaulty(event->getEvent()); // case DeviceHandlerIF::MONITORING_LIMIT_EXCEEDED:
// break; // setFaulty(event->getEvent());
// case DeviceHandlerIF::MONITORING_AMBIGUOUS: // break;
// break; // case DeviceHandlerIF::MONITORING_AMBIGUOUS:
default: // break;
//We don't know the event, someone else should handle it. default:
return RETURN_FAILED; //We don't know the event, someone else should handle it.
} return RETURN_FAILED;
return RETURN_OK; }
} return RETURN_OK;
}
void DeviceHandlerFailureIsolation::eventConfirmed(EventMessage* event) {
switch (event->getEvent()) { void DeviceHandlerFailureIsolation::eventConfirmed(EventMessage* event) {
case DeviceHandlerIF::DEVICE_SENDING_COMMAND_FAILED: switch (event->getEvent()) {
case DeviceHandlerIF::DEVICE_REQUESTING_REPLY_FAILED: case DeviceHandlerIF::DEVICE_SENDING_COMMAND_FAILED:
case DeviceHandlerIF::DEVICE_MISSED_REPLY: case DeviceHandlerIF::DEVICE_REQUESTING_REPLY_FAILED:
if (missedReplyCount.incrementAndCheck()) { case DeviceHandlerIF::DEVICE_MISSED_REPLY:
handleRecovery(event->getEvent()); if (missedReplyCount.incrementAndCheck()) {
} handleRecovery(event->getEvent());
break; }
case PowerSwitchIF::SWITCH_WENT_OFF: break;
//This means the switch went off only for one device. case PowerSwitchIF::SWITCH_WENT_OFF:
handleRecovery(event->getEvent()); //This means the switch went off only for one device.
break; handleRecovery(event->getEvent());
default: break;
break; default:
} break;
} }
}
void DeviceHandlerFailureIsolation::decrementFaultCounters() {
strangeReplyCount.checkForDecrement(); void DeviceHandlerFailureIsolation::decrementFaultCounters() {
missedReplyCount.checkForDecrement(); strangeReplyCount.checkForDecrement();
recoveryCounter.checkForDecrement(); missedReplyCount.checkForDecrement();
} recoveryCounter.checkForDecrement();
}
void DeviceHandlerFailureIsolation::handleRecovery(Event reason) {
clearFaultCounters(); void DeviceHandlerFailureIsolation::handleRecovery(Event reason) {
if (not recoveryCounter.incrementAndCheck()) { clearFaultCounters();
startRecovery(reason); if (not recoveryCounter.incrementAndCheck()) {
} else { startRecovery(reason);
setFaulty(reason); } else {
} setFaulty(reason);
} }
}
void DeviceHandlerFailureIsolation::wasParentsFault(EventMessage* event) {
//We'd better ignore the SWITCH_WENT_OFF event and await a system-wide reset. void DeviceHandlerFailureIsolation::wasParentsFault(EventMessage* event) {
//This means, no fault message will come through until a MODE_ or //We'd better ignore the SWITCH_WENT_OFF event and await a system-wide reset.
//HEALTH_INFO message comes through -> Is that ok? //This means, no fault message will come through until a MODE_ or
//Same issue in TxFailureIsolation! //HEALTH_INFO message comes through -> Is that ok?
// if ((event->getEvent() == PowerSwitchIF::SWITCH_WENT_OFF) //Same issue in TxFailureIsolation!
// && (fdirState != RECOVERY_ONGOING)) { // if ((event->getEvent() == PowerSwitchIF::SWITCH_WENT_OFF)
// setFdirState(NONE); // && (fdirState != RECOVERY_ONGOING)) {
// } // setFdirState(NONE);
} // }
}
void DeviceHandlerFailureIsolation::clearFaultCounters() {
strangeReplyCount.clear(); void DeviceHandlerFailureIsolation::clearFaultCounters() {
missedReplyCount.clear(); strangeReplyCount.clear();
} missedReplyCount.clear();
}
ReturnValue_t DeviceHandlerFailureIsolation::initialize() {
ReturnValue_t result = FailureIsolationBase::initialize(); ReturnValue_t DeviceHandlerFailureIsolation::initialize() {
if (result != HasReturnvaluesIF::RETURN_OK) { ReturnValue_t result = FailureIsolationBase::initialize();
sif::error << "DeviceHandlerFailureIsolation::initialize: Could not" if (result != HasReturnvaluesIF::RETURN_OK) {
" initialize FailureIsolationBase." << std::endl; sif::error << "DeviceHandlerFailureIsolation::initialize: Could not"
return result; " initialize FailureIsolationBase." << std::endl;
} return result;
ConfirmsFailuresIF* power = objectManager->get<ConfirmsFailuresIF>( }
powerConfirmationId); ConfirmsFailuresIF* power = objectManager->get<ConfirmsFailuresIF>(
if (power != nullptr) { powerConfirmationId);
powerConfirmation = power->getEventReceptionQueue(); if (power != nullptr) {
hasPowerConfirmation = true; powerConfirmation = power->getEventReceptionQueue();
} }
return RETURN_OK; return RETURN_OK;
} }
void DeviceHandlerFailureIsolation::setFdirState(FDIRState state) { void DeviceHandlerFailureIsolation::setFdirState(FDIRState state) {
FailureIsolationBase::throwFdirEvent(FDIR_CHANGED_STATE, state, fdirState); FailureIsolationBase::throwFdirEvent(FDIR_CHANGED_STATE, state, fdirState);
fdirState = state; fdirState = state;
} }
void DeviceHandlerFailureIsolation::triggerEvent(Event event, uint32_t parameter1, void DeviceHandlerFailureIsolation::triggerEvent(Event event, uint32_t parameter1,
uint32_t parameter2) { uint32_t parameter2) {
//Do not throw error events if fdirState != none. //Do not throw error events if fdirState != none.
//This will still forward MODE and HEALTH INFO events in any case. //This will still forward MODE and HEALTH INFO events in any case.
if (fdirState == NONE || EVENT::getSeverity(event) == SEVERITY::INFO) { if (fdirState == NONE || EVENT::getSeverity(event) == SEVERITY::INFO) {
FailureIsolationBase::triggerEvent(event, parameter1, parameter2); FailureIsolationBase::triggerEvent(event, parameter1, parameter2);
} }
} }
bool DeviceHandlerFailureIsolation::isFdirActionInProgress() { bool DeviceHandlerFailureIsolation::isFdirActionInProgress() {
return (fdirState != NONE); return (fdirState != NONE);
} }
void DeviceHandlerFailureIsolation::startRecovery(Event reason) { void DeviceHandlerFailureIsolation::startRecovery(Event reason) {
throwFdirEvent(FDIR_STARTS_RECOVERY, EVENT::getEventId(reason)); throwFdirEvent(FDIR_STARTS_RECOVERY, EVENT::getEventId(reason));
setOwnerHealth(HasHealthIF::NEEDS_RECOVERY); setOwnerHealth(HasHealthIF::NEEDS_RECOVERY);
setFdirState(RECOVERY_ONGOING); setFdirState(RECOVERY_ONGOING);
} }
ReturnValue_t DeviceHandlerFailureIsolation::getParameter(uint8_t domainId, ReturnValue_t DeviceHandlerFailureIsolation::getParameter(uint8_t domainId,
uint16_t parameterId, ParameterWrapper* parameterWrapper, uint16_t parameterId, ParameterWrapper* parameterWrapper,
const ParameterWrapper* newValues, uint16_t startAtIndex) { const ParameterWrapper* newValues, uint16_t startAtIndex) {
ReturnValue_t result = strangeReplyCount.getParameter(domainId, parameterId, ReturnValue_t result = strangeReplyCount.getParameter(domainId, parameterId,
parameterWrapper, newValues, startAtIndex); parameterWrapper, newValues, startAtIndex);
if (result != INVALID_DOMAIN_ID) { if (result != INVALID_DOMAIN_ID) {
return result; return result;
} }
result = missedReplyCount.getParameter(domainId, parameterId, result = missedReplyCount.getParameter(domainId, parameterId,
parameterWrapper, newValues, startAtIndex); parameterWrapper, newValues, startAtIndex);
if (result != INVALID_DOMAIN_ID) { if (result != INVALID_DOMAIN_ID) {
return result; return result;
} }
result = recoveryCounter.getParameter(domainId, parameterId, result = recoveryCounter.getParameter(domainId, parameterId,
parameterWrapper, newValues, startAtIndex); parameterWrapper, newValues, startAtIndex);
if (result != INVALID_DOMAIN_ID) { if (result != INVALID_DOMAIN_ID) {
return result; return result;
} }
return INVALID_DOMAIN_ID; return INVALID_DOMAIN_ID;
} }
void DeviceHandlerFailureIsolation::setFaulty(Event reason) { void DeviceHandlerFailureIsolation::setFaulty(Event reason) {
throwFdirEvent(FDIR_TURNS_OFF_DEVICE, EVENT::getEventId(reason)); throwFdirEvent(FDIR_TURNS_OFF_DEVICE, EVENT::getEventId(reason));
setOwnerHealth(HasHealthIF::FAULTY); setOwnerHealth(HasHealthIF::FAULTY);
setFdirState(AWAIT_SHUTDOWN); setFdirState(AWAIT_SHUTDOWN);
} }
bool DeviceHandlerFailureIsolation::isFdirInActionOrAreWeFaulty( bool DeviceHandlerFailureIsolation::isFdirInActionOrAreWeFaulty(
EventMessage* event) { EventMessage* event) {
if (fdirState != NONE) { if (fdirState != NONE) {
//Only wait for those events, ignore all others. //Only wait for those events, ignore all others.
if (event->getParameter1() == HasHealthIF::HEALTHY if (event->getParameter1() == HasHealthIF::HEALTHY
&& event->getEvent() == HasHealthIF::HEALTH_INFO) { && event->getEvent() == HasHealthIF::HEALTH_INFO) {
setFdirState(NONE); setFdirState(NONE);
} }
if (event->getEvent() == HasModesIF::MODE_INFO if (event->getEvent() == HasModesIF::MODE_INFO
&& fdirState != RECOVERY_ONGOING) { && fdirState != RECOVERY_ONGOING) {
setFdirState(NONE); setFdirState(NONE);
} }
return true; return true;
} }
if (owner->getHealth() == HasHealthIF::FAULTY if (owner->getHealth() == HasHealthIF::FAULTY
|| owner->getHealth() == HasHealthIF::PERMANENT_FAULTY) { || owner->getHealth() == HasHealthIF::PERMANENT_FAULTY) {
//Ignore all events in case device is already faulty. //Ignore all events in case device is already faulty.
return true; return true;
} }
return false; return false;
} }

View File

@ -1,54 +1,53 @@
#ifndef FRAMEWORK_DEVICEHANDLERS_DEVICEHANDLERFAILUREISOLATION_H_ #ifndef FRAMEWORK_DEVICEHANDLERS_DEVICEHANDLERFAILUREISOLATION_H_
#define FRAMEWORK_DEVICEHANDLERS_DEVICEHANDLERFAILUREISOLATION_H_ #define FRAMEWORK_DEVICEHANDLERS_DEVICEHANDLERFAILUREISOLATION_H_
#include "../fdir/FaultCounter.h" #include "../fdir/FaultCounter.h"
#include "../fdir/FailureIsolationBase.h" #include "../fdir/FailureIsolationBase.h"
namespace Factory{
void setStaticFrameworkObjectIds(); namespace Factory{
} void setStaticFrameworkObjectIds();
}
class DeviceHandlerFailureIsolation: public FailureIsolationBase { class DeviceHandlerFailureIsolation: public FailureIsolationBase {
friend void (Factory::setStaticFrameworkObjectIds)(); friend void (Factory::setStaticFrameworkObjectIds)();
friend class Heater; friend class Heater;
public: public:
DeviceHandlerFailureIsolation(object_id_t owner, object_id_t parent); DeviceHandlerFailureIsolation(object_id_t owner, object_id_t parent);
~DeviceHandlerFailureIsolation(); ~DeviceHandlerFailureIsolation();
ReturnValue_t initialize(); ReturnValue_t initialize();
void triggerEvent(Event event, uint32_t parameter1 = 0, void triggerEvent(Event event, uint32_t parameter1 = 0,
uint32_t parameter2 = 0);bool isFdirActionInProgress(); uint32_t parameter2 = 0);bool isFdirActionInProgress();
virtual ReturnValue_t getParameter(uint8_t domainId, uint16_t parameterId, virtual ReturnValue_t getParameter(uint8_t domainId, uint16_t parameterId,
ParameterWrapper *parameterWrapper, ParameterWrapper *parameterWrapper,
const ParameterWrapper *newValues, uint16_t startAtIndex); const ParameterWrapper *newValues, uint16_t startAtIndex);
protected: protected:
FaultCounter strangeReplyCount; FaultCounter strangeReplyCount;
FaultCounter missedReplyCount; FaultCounter missedReplyCount;
FaultCounter recoveryCounter; FaultCounter recoveryCounter;
enum FDIRState { enum FDIRState {
NONE, RECOVERY_ONGOING, DEVICE_MIGHT_BE_OFF, AWAIT_SHUTDOWN NONE, RECOVERY_ONGOING, DEVICE_MIGHT_BE_OFF, AWAIT_SHUTDOWN
}; };
FDIRState fdirState; FDIRState fdirState;
bool hasPowerConfirmation = false; MessageQueueId_t powerConfirmation = MessageQueueIF::NO_QUEUE;
MessageQueueId_t powerConfirmation; static object_id_t powerConfirmationId;
static object_id_t powerConfirmationId; // TODO: Are those hardcoded values? How can they be changed?
// TODO: Are those hardcoded values? How can they be changed? static const uint32_t MAX_REBOOT = 1;
static const uint32_t MAX_REBOOT = 1; static const uint32_t REBOOT_TIME_MS = 180000;
static const uint32_t REBOOT_TIME_MS = 180000; static const uint32_t MAX_STRANGE_REPLIES = 10;
static const uint32_t MAX_STRANGE_REPLIES = 10; static const uint32_t STRANGE_REPLIES_TIME_MS = 10000;
static const uint32_t STRANGE_REPLIES_TIME_MS = 10000; static const uint32_t MAX_MISSED_REPLY_COUNT = 5;
static const uint32_t MAX_MISSED_REPLY_COUNT = 5; static const uint32_t MISSED_REPLY_TIME_MS = 10000;
static const uint32_t MISSED_REPLY_TIME_MS = 10000; virtual ReturnValue_t eventReceived(EventMessage* event);
virtual ReturnValue_t eventReceived(EventMessage* event); virtual void eventConfirmed(EventMessage* event);
virtual void eventConfirmed(EventMessage* event); void wasParentsFault(EventMessage* event);
void wasParentsFault(EventMessage* event); void decrementFaultCounters();
void decrementFaultCounters(); void handleRecovery(Event reason);
void handleRecovery(Event reason); virtual void clearFaultCounters();
virtual void clearFaultCounters(); void setFdirState(FDIRState state);
void setFdirState(FDIRState state); void startRecovery(Event reason);
void startRecovery(Event reason); void setFaulty(Event reason);
void setFaulty(Event reason);
bool isFdirInActionOrAreWeFaulty(EventMessage* event);
bool isFdirInActionOrAreWeFaulty(EventMessage* event); };
};
#endif /* FRAMEWORK_DEVICEHANDLERS_DEVICEHANDLERFAILUREISOLATION_H_ */
#endif /* FRAMEWORK_DEVICEHANDLERS_DEVICEHANDLERFAILUREISOLATION_H_ */