reworked watchdog

This commit is contained in:
Robin Müller 2023-02-23 23:56:11 +01:00
parent e33a0fd60b
commit f789380343
No known key found for this signature in database
GPG Key ID: 11D4952C8CCEF814
14 changed files with 470 additions and 366 deletions

View File

@ -16,6 +16,18 @@ will consitute of a breaking change warranting a new major release:
# [unreleased]
## Changed
- Improved the OBSW watchdog by adding a watch functionality. The watch functionality is optional
and has to be enabled specifically by the application being watched by the watchdog when
starting the watchdog. If the watch functionality is enabled and the OBSW has not pinged
the watchdog via the FIFO for 2 minutes, the watchdog will restart the OBSW service via systemd.
The primary OBSW will only activate the watch functionality if it is the OBSW inside the
`/usr/bin` directory. This allows debugging the system by leaving flashed or manually copied
debugging images 2 minutes to start the watchdog without the watch functionality.
- The SD card prefix is now set earlier inside the `CoreController` constructor
- The watchdog handling was moved outside the `CoreController` into the main loop.
# [v1.31.1]
## Fixed

View File

@ -1,4 +1,4 @@
target_sources(${OBSW_NAME} PRIVATE CoreController.cpp scheduling.cpp
ObjectFactory.cpp)
ObjectFactory.cpp WatchdogHandler.cpp)
target_sources(${SIMPLE_OBSW_NAME} PRIVATE scheduling.cpp)

View File

@ -33,12 +33,7 @@ xsc::Copy CoreController::CURRENT_COPY = xsc::Copy::NO_COPY;
CoreController::CoreController(object_id_t objectId)
: ExtendedControllerBase(objectId, 5), opDivider5(5), opDivider10(10), hkSet(this) {
ReturnValue_t result = returnvalue::OK;
try {
result = initWatchdogFifo();
if (result != returnvalue::OK) {
sif::warning << "CoreController::CoreController: Watchdog FIFO init failed" << std::endl;
}
sdcMan = SdCardManager::instance();
if (sdcMan == nullptr) {
sif::error << "CoreController::CoreController: SD card manager invalid!" << std::endl;
@ -47,11 +42,25 @@ CoreController::CoreController(object_id_t objectId)
if (not BLOCKING_SD_INIT) {
sdcMan->setBlocking(false);
}
auto sdCard = sdcMan->getPreferredSdCard();
if (not sdCard.has_value()) {
sif::error << "CoreController::initializeAfterTaskCreation: "
"Issues getting preferred SD card, setting to 0"
<< std::endl;
sdCard = sd::SdCard::SLOT_0;
}
sdInfo.active = sdCard.value();
sdcMan->setActiveSdCard(sdInfo.active);
currMntPrefix = sdcMan->getCurrentMountPrefix();
getCurrentBootCopy(CURRENT_CHIP, CURRENT_COPY);
} catch (const std::filesystem::filesystem_error &e) {
sif::error << "CoreController::CoreController: Failed with exception " << e.what() << std::endl;
}
// Add script folder to path
char *currentEnvPath = getenv("PATH");
std::string updatedEnvPath = std::string(currentEnvPath) + ":/home/root/scripts:/usr/local/bin";
setenv("PATH", updatedEnvPath.c_str(), true);
sdCardCheckCd.timeOut();
eventQueue = QueueFactory::instance()->createMessageQueue(5, EventMessage::MAX_MESSAGE_SIZE);
}
@ -76,7 +85,6 @@ void CoreController::performControlOperation() {
}
}
}
performWatchdogControlOperation();
sdStateMachine();
performMountedSdCardOperations();
if (sdCardCheckCd.hasTimedOut()) {
@ -146,19 +154,6 @@ ReturnValue_t CoreController::initialize() {
ReturnValue_t CoreController::initializeAfterTaskCreation() {
ReturnValue_t result = returnvalue::OK;
auto sdCard = sdcMan->getPreferredSdCard();
if (not sdCard) {
return returnvalue::FAILED;
}
sdInfo.active = sdCard.value();
if (sdInfo.active == sd::SdCard::NONE) {
sif::error << "CoreController::initializeAfterTaskCreation: "
"Issues getting preferred SD card, setting to 0"
<< std::endl;
sdInfo.active = sd::SdCard::SLOT_0;
}
sdcMan->setActiveSdCard(sdInfo.active);
currMntPrefix = sdcMan->getCurrentMountPrefix();
if (BLOCKING_SD_INIT) {
result = initSdCardBlocking();
if (result != returnvalue::OK and result != SdCardManager::ALREADY_MOUNTED) {
@ -170,12 +165,7 @@ ReturnValue_t CoreController::initializeAfterTaskCreation() {
if (result != returnvalue::OK) {
sif::warning << "CoreController::initialize: Version initialization failed" << std::endl;
}
// Add script folder to path
char *currentEnvPath = getenv("PATH");
std::string updatedEnvPath = std::string(currentEnvPath) + ":/home/root/scripts:/usr/local/bin";
setenv("PATH", updatedEnvPath.c_str(), true);
updateProtInfo();
initPrint();
return ExtendedControllerBase::initializeAfterTaskCreation();
}
@ -839,36 +829,6 @@ void CoreController::getCurrentBootCopy(xsc::Chip &chip, xsc::Copy &copy) {
copy = static_cast<xsc::Copy>(xscCopy);
}
ReturnValue_t CoreController::initWatchdogFifo() {
if (not std::filesystem::exists(watchdog::FIFO_NAME)) {
// Still return returnvalue::OK for now
sif::info << "Watchdog FIFO " << watchdog::FIFO_NAME << " does not exist, can't initiate"
<< " watchdog" << std::endl;
return returnvalue::OK;
}
// Open FIFO write only and non-blocking to prevent SW from killing itself.
watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
if (watchdogFifoFd < 0) {
if (errno == ENXIO) {
watchdogFifoFd = RETRY_FIFO_OPEN;
sif::info << "eive-watchdog not running. FIFO can not be opened" << std::endl;
} else {
sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with " << errno
<< ": " << strerror(errno) << std::endl;
return returnvalue::FAILED;
}
}
return returnvalue::OK;
}
void CoreController::initPrint() {
#if OBSW_VERBOSE_LEVEL >= 1
if (watchdogFifoFd > 0) {
sif::info << "Opened watchdog FIFO successfully.." << std::endl;
}
#endif
}
ReturnValue_t CoreController::actionXscReboot(const uint8_t *data, size_t size) {
if (size < 1) {
return HasActionsIF::INVALID_PARAMETERS;
@ -1223,36 +1183,6 @@ ReturnValue_t CoreController::handleProtInfoUpdateLine(std::string nextLine) {
return returnvalue::OK;
}
void CoreController::performWatchdogControlOperation() {
// Only perform each fifth iteration
if (watchdogFifoFd != 0 and opDivider5.check()) {
if (watchdogFifoFd == RETRY_FIFO_OPEN) {
// Open FIFO write only and non-blocking
watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
if (watchdogFifoFd < 0) {
if (errno == ENXIO) {
watchdogFifoFd = RETRY_FIFO_OPEN;
// No printout for now, would be spam
return;
} else {
sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with "
<< errno << ": " << strerror(errno) << std::endl;
return;
}
}
sif::info << "Opened " << watchdog::FIFO_NAME << " successfully" << std::endl;
} else if (watchdogFifoFd > 0) {
// Write to OBSW watchdog FIFO here
const char writeChar = 'a';
ssize_t writtenBytes = write(watchdogFifoFd, &writeChar, 1);
if (writtenBytes < 0) {
sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno)
<< std::endl;
}
}
}
}
void CoreController::performMountedSdCardOperations() {
auto mountedSdCardOp = [&](sd::SdCard sdCard, std::string mntPoint) {
if (not performOneShotSdCardOpsSwitch) {

View File

@ -162,9 +162,6 @@ class CoreController : public ExtendedControllerBase {
private:
static constexpr MutexIF::TimeoutType TIMEOUT_TYPE = MutexIF::TimeoutType::WAITING;
static constexpr uint32_t MUTEX_TIMEOUT = 20;
// Designated value for rechecking FIFO open
static constexpr int RETRY_FIFO_OPEN = -2;
int watchdogFifoFd = 0;
GpsHyperion::FixMode gpsFix = GpsHyperion::FixMode::UNKNOWN;
// States for SD state machine, which is used in non-blocking mode
@ -260,7 +257,6 @@ class CoreController : public ExtendedControllerBase {
ReturnValue_t performSdCardCheck();
ReturnValue_t timeFileHandler();
ReturnValue_t initBootCopyFile();
ReturnValue_t initWatchdogFifo();
ReturnValue_t initSdCardBlocking();
bool startSdStateMachine(sd::SdCard targetActiveSd, SdCfgMode mode, MessageQueueId_t commander,
DeviceCommandId_t actionId);
@ -285,8 +281,6 @@ class CoreController : public ExtendedControllerBase {
ReturnValue_t gracefulShutdownTasks(xsc::Chip chip, xsc::Copy copy, bool& protOpPerformed);
void performWatchdogControlOperation();
ReturnValue_t handleProtInfoUpdateLine(std::string nextLine);
int handleBootCopyProtAtIndex(xsc::Chip targetChip, xsc::Copy targetCopy, bool protect,
bool& protOperationPerformed, bool selfChip, bool selfCopy,

View File

@ -0,0 +1,84 @@
#include "WatchdogHandler.h"
#include <fcntl.h>
#include <unistd.h>
#include <cerrno>
#include <cstring>
#include <filesystem>
#include "fsfw/serviceinterface.h"
#include "watchdog/definitions.h"
WatchdogHandler::WatchdogHandler() {}
void WatchdogHandler::periodicOperation() {
if (watchdogFifoFd != 0) {
if (watchdogFifoFd == RETRY_FIFO_OPEN) {
// Open FIFO write only and non-blocking
watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
if (watchdogFifoFd < 0) {
if (errno == ENXIO) {
watchdogFifoFd = RETRY_FIFO_OPEN;
// No printout for now, would be spam
return;
} else {
sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with "
<< errno << ": " << strerror(errno) << std::endl;
return;
}
}
sif::info << "Opened " << watchdog::FIFO_NAME << " successfully" << std::endl;
performStartHandling();
} else if (watchdogFifoFd > 0) {
// Write to OBSW watchdog FIFO here
const char writeChar = watchdog::first::IDLE_CHAR;
ssize_t writtenBytes = write(watchdogFifoFd, &writeChar, 1);
if (writtenBytes < 0) {
sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno)
<< std::endl;
}
}
}
}
ReturnValue_t WatchdogHandler::initialize(bool enableWatchdogFunction) {
using namespace std::filesystem;
this->enableWatchFunction = enableWatchdogFunction;
if (not std::filesystem::exists(watchdog::FIFO_NAME)) {
// Still return returnvalue::OK for now
sif::info << "Watchdog FIFO " << watchdog::FIFO_NAME << " does not exist, can't initiate"
<< " watchdog" << std::endl;
return returnvalue::OK;
}
// Open FIFO write only and non-blocking to prevent SW from killing itself.
watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
if (watchdogFifoFd < 0) {
if (errno == ENXIO) {
watchdogFifoFd = RETRY_FIFO_OPEN;
sif::info << "eive-watchdog not running. FIFO can not be opened" << std::endl;
} else {
sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with " << errno
<< ": " << strerror(errno) << std::endl;
return returnvalue::FAILED;
}
}
return performStartHandling();
}
ReturnValue_t WatchdogHandler::performStartHandling() {
char startBuf[2];
size_t writeLen = 1;
startBuf[0] = watchdog::first::START_CHAR;
if (enableWatchFunction) {
writeLen += 1;
startBuf[1] = watchdog::second::WATCH_FLAG;
}
ssize_t writtenBytes = write(watchdogFifoFd, &startBuf, writeLen);
if (writtenBytes < 0) {
sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno)
<< std::endl;
return returnvalue::FAILED;
}
return returnvalue::OK;
}

View File

@ -0,0 +1,23 @@
#ifndef BSP_Q7S_CORE_WATCHDOGHANDLER_H_
#define BSP_Q7S_CORE_WATCHDOGHANDLER_H_
#include "fsfw/returnvalues/returnvalue.h"
class WatchdogHandler {
public:
WatchdogHandler();
ReturnValue_t initialize(bool enableWatchFunction);
void periodicOperation();
private:
// Designated value for rechecking FIFO open
static constexpr int RETRY_FIFO_OPEN = -2;
int watchdogFifoFd = 0;
bool enableWatchFunction = false;
ReturnValue_t performStartHandling();
};
#endif /* BSP_Q7S_CORE_WATCHDOGHANDLER_H_ */

View File

@ -9,6 +9,7 @@
#include <iostream>
#include "OBSWConfig.h"
#include "bsp_q7s/core/WatchdogHandler.h"
#include "commonConfig.h"
#include "core/scheduling.h"
#include "fsfw/tasks/TaskFactory.h"
@ -24,6 +25,9 @@ static const char* DEV_STRING = "Xiphos Q7S FM";
#else
static const char* DEV_STRING = "Xiphos Q7S EM";
#endif
WatchdogHandler WATCHDOG_HANDLER;
int obsw::obsw() {
using namespace fsfw;
std::cout << "-- EIVE OBSW --" << std::endl;
@ -44,6 +48,35 @@ int obsw::obsw() {
}
#endif
// Delay the boot if applicable.
bootDelayHandling();
bool initWatchFunction = false;
if (std::filesystem::current_path() == "/usr/bin") {
initWatchFunction = true;
}
ReturnValue_t result = WATCHDOG_HANDLER.initialize(initWatchFunction);
if (result != returnvalue::OK) {
std::cerr << "Initiating EIVE watchdog handler failed" << std::endl;
}
scheduling::initMission();
// Command the EIVE system to safe mode
#if OBSW_COMMAND_SAFE_MODE_AT_STARTUP == 1
commandEiveSystemToSafe();
#else
announceAllModes();
#endif
for (;;) {
WATCHDOG_HANDLER.periodicOperation();
TaskFactory::delayTask(1000);
}
return 0;
}
void obsw::bootDelayHandling() {
const char* homedir = nullptr;
homedir = getenv("HOME");
if (homedir == nullptr) {
@ -71,31 +104,26 @@ int obsw::obsw() {
std::cout << "Delaying OBSW start for " << bootDelaySecs << " seconds" << std::endl;
TaskFactory::delayTask(bootDelaySecs * 1000);
}
}
scheduling::initMission();
// Command the EIVE system to safe mode
void obsw::commandEiveSystemToSafe() {
auto sysQueueId = satsystem::EIVE_SYSTEM.getCommandQueue();
CommandMessage msg;
#if OBSW_COMMAND_SAFE_MODE_AT_STARTUP == 1
ModeMessage::setCmdModeMessage(msg, acs::AcsMode::SAFE, 0);
ReturnValue_t result =
MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false);
if (result != returnvalue::OK) {
sif::error << "Sending safe mode command to EIVE system failed" << std::endl;
}
#else
}
void obsw::announceAllModes() {
auto sysQueueId = satsystem::EIVE_SYSTEM.getCommandQueue();
CommandMessage msg;
ModeMessage::setModeAnnounceMessage(msg, true);
ReturnValue_t result =
MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false);
if (result != returnvalue::OK) {
sif::error << "Sending safe mode command to EIVE system failed" << std::endl;
}
#endif
for (;;) {
/* Suspend main thread by sleeping it. */
TaskFactory::delayTask(5000);
}
return 0;
}

View File

@ -5,6 +5,10 @@ namespace obsw {
int obsw();
};
void bootDelayHandling();
void commandEiveSystemToSafe();
void announceAllModes();
}; // namespace obsw
#endif /* BSP_Q7S_CORE_OBSW_H_ */

View File

@ -4,6 +4,7 @@ if [[ ! -f README.md ]]; then
fi
folder_list=(
"./watchdog"
"./mission"
"./linux"
"./bsp_q7s"

View File

@ -1,10 +1,5 @@
target_sources(${WATCHDOG_NAME} PRIVATE
main.cpp
Watchdog.cpp
)
target_sources(${WATCHDOG_NAME} PRIVATE main.cpp Watchdog.cpp)
target_include_directories(${WATCHDOG_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
)
target_include_directories(${WATCHDOG_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
install(TARGETS ${WATCHDOG_NAME} RUNTIME DESTINATION bin)

View File

@ -1,96 +1,61 @@
#include "Watchdog.h"
#include "definitions.h"
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <poll.h>
#include <unistd.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <iostream>
#include <fstream>
#include <thread>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <thread>
#include "definitions.h"
WatchdogTask::WatchdogTask (): fd(0) {
WatchdogTask::WatchdogTask() : fd(0) {
int result = 0;
// Only create the FIFO if it does not exist yet
if(not std::filesystem::exists(watchdog::FIFO_NAME)) {
if (not std::filesystem::exists(watchdog::FIFO_NAME)) {
// Permission 666 or rw-rw-rw-
mode_t mode = DEFFILEMODE;
result = mkfifo(watchdog::FIFO_NAME.c_str(), mode);
if(result != 0) {
std::cerr << "eive-watchdog: Could not created named pipe at " <<
watchdog::FIFO_NAME << ", error " << errno << ": " << strerror(errno) <<
std::endl;
if (result != 0) {
std::cerr << "eive-watchdog: Could not created named pipe at " << watchdog::FIFO_NAME
<< ", error " << errno << ": " << strerror(errno) << std::endl;
throw std::runtime_error("eive-watchdog: FIFO creation failed");
}
#if WATCHDOG_VERBOSE_LEVEL >= 1
std::cout << "eive-watchdog: Pipe at " << watchdog::FIFO_NAME <<
" created successfully" << std::endl;
std::cout << "eive-watchdog: Pipe at " << watchdog::FIFO_NAME << " created successfully"
<< std::endl;
#endif
}
}
WatchdogTask::~WatchdogTask() {
}
WatchdogTask::~WatchdogTask() {}
int WatchdogTask::performOperation() {
// Open FIFO read only and non-blocking
fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK);
if(fd < 0) {
std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME <<
"read-only failed with " << errno << ": " << strerror(errno) << std::endl;
if (fd < 0) {
std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME << "read-only failed with "
<< errno << ": " << strerror(errno) << std::endl;
return -1;
}
state = States::RUNNING;
state = States::NOT_STARTED;
while(true) {
while (true) {
WatchdogTask::LoopResult loopResult = watchdogLoop();
switch(loopResult) {
case(LoopResult::OK): {
performRunningOperation();
if (not stateMachine(loopResult)) {
break;
}
case(LoopResult::CANCEL_RQ): {
std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl;
return 0;
}
case(LoopResult::SUSPEND_RQ): {
performSuspendOperation();
break;
}
case(LoopResult::TIMEOUT): {
performNotRunningOperation(loopResult);
break;
}
case(LoopResult::HUNG_UP): {
performNotRunningOperation(loopResult);
break;
}
case(LoopResult::RESTART_RQ): {
if(state == States::SUSPENDED or state == States::FAULTY) {
performRunningOperation();
}
break;
}
case(LoopResult::FAULT): {
using namespace std::chrono_literals;
// Configuration error
std::cerr << "Fault has occured in watchdog loop" << std::endl;
// Prevent spam
std::this_thread::sleep_for(2000ms);
}
}
}
if (close(fd) < 0) {
std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME <<
"failed, error " << errno << ": " << strerror(errno) << std::endl;
std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME << "failed, error "
<< errno << ": " << strerror(errno) << std::endl;
}
std::cout << "eive-watchdog: Finished" << std::endl;
return 0;
@ -102,38 +67,17 @@ WatchdogTask::LoopResult WatchdogTask::watchdogLoop() {
waiter.fd = fd;
waiter.events = POLLIN;
switch(state) {
case(States::SUSPENDED): {
// Sleep, then check whether a restart request was received
std::this_thread::sleep_for(1000ms);
break;
}
case(States::RUNNING): {
// Continue as usual
break;
}
case(States::NOT_STARTED): {
// This should not happen
std::cerr << "eive-watchdog: State is NOT_STARTED, configuration error" << std::endl;
break;
}
case(States::FAULTY): {
// TODO: Not sure what to do yet. Continue for now
break;
}
}
// 10 seconds timeout, only poll one file descriptor
switch(poll(&waiter, 1, watchdog::TIMEOUT_MS)) {
case(0): {
// Only poll one file descriptor with timeout
switch (poll(&waiter, 1, watchdog::TIMEOUT_MS)) {
case (0): {
return LoopResult::TIMEOUT;
}
case(1): {
case (1): {
return pollEvent(waiter);
}
default: {
std::cerr << "eive-watchdog: Unknown poll error at " << watchdog::FIFO_NAME << ", error " <<
errno << ": " << strerror(errno) << std::endl;
std::cerr << "eive-watchdog: Unknown poll error at " << watchdog::FIFO_NAME << ", error "
<< errno << ": " << strerror(errno) << std::endl;
break;
}
}
@ -144,58 +88,55 @@ WatchdogTask::LoopResult WatchdogTask::pollEvent(struct pollfd& waiter) {
if (waiter.revents & POLLIN) {
ssize_t readLen = read(fd, buf.data(), buf.size());
if (readLen < 0) {
std::cerr << "eive-watchdog: Read error on pipe " << watchdog::FIFO_NAME <<
", error " << errno << ": " << strerror(errno) << std::endl;
std::cerr << "eive-watchdog: Read error on pipe " << watchdog::FIFO_NAME << ", error "
<< errno << ": " << strerror(errno) << std::endl;
return LoopResult::OK;
}
#if WATCHDOG_VERBOSE_LEVEL == 2
std::cout << "Read " << readLen << " byte(s) on the pipe " << FIFO_NAME
<< std::endl;
std::cout << "Read " << readLen << " byte(s) on the pipe " << FIFO_NAME << std::endl;
#endif
else if(readLen >= 1) {
return parseCommandByte(readLen);
else if (readLen >= 1) {
return parseCommand(readLen);
}
}
else if(waiter.revents & POLLERR) {
std::cerr << "eive-watchdog: Poll error error on pipe " << watchdog::FIFO_NAME <<
std::endl;
} else if (waiter.revents & POLLERR) {
std::cerr << "eive-watchdog: Poll error error on pipe " << watchdog::FIFO_NAME << std::endl;
return LoopResult::FAULT;
}
else if (waiter.revents & POLLHUP) {
} else if (waiter.revents & POLLHUP) {
// Writer closed its end
return LoopResult::HUNG_UP;
}
return LoopResult::FAULT;
}
WatchdogTask::LoopResult WatchdogTask::parseCommandByte(ssize_t readLen) {
for(ssize_t idx = 0; idx < readLen; idx++) {
char readChar = buf[idx];
WatchdogTask::LoopResult WatchdogTask::parseCommand(ssize_t readLen) {
char readChar = buf[0];
// Cancel request
if(readChar == watchdog::CANCEL_CHAR) {
return LoopResult::CANCEL_RQ;
}
// Begin request. Does not work if the operation was not suspended before
else if(readChar == watchdog::RESTART_CHAR) {
return LoopResult::RESTART_RQ;
}
if (readChar == watchdog::first::CANCEL_CHAR) {
return LoopResult::CANCEL_REQ;
} else if (readChar == watchdog::first::SUSPEND_CHAR) {
// Suspend request
else if(readChar == watchdog::SUSPEND_CHAR) {
return LoopResult::SUSPEND_RQ;
return LoopResult::SUSPEND_REQ;
} else if (readChar == watchdog::first::START_CHAR) {
if (readLen == 2 and static_cast<char>(buf[1]) == watchdog::second::WATCH_FLAG) {
return LoopResult::START_WITH_WATCH_REQ;
}
return LoopResult::START_REQ;
}
// Everything else: All working as expected
}
return LoopResult::OK;
}
int WatchdogTask::performRunningOperation() {
if(state != States::RUNNING) {
if (state != States::RUNNING) {
state = States::RUNNING;
}
if (notRunningStart.has_value()) {
notRunningStart = std::nullopt;
}
if(not obswRunning) {
if(printNotRunningLatch) {
if (not obswRunning) {
if (printNotRunningLatch) {
// Reset latch so user can see timeouts
printNotRunningLatch = false;
}
@ -206,9 +147,8 @@ int WatchdogTask::performRunningOperation() {
std::cout << "eive-watchdog: Creating " << watchdog::RUNNING_FILE_NAME << std::endl;
if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) {
std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME);
if(not obswRunningFile.good()) {
std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed"
<< std::endl;
if (not obswRunningFile.good()) {
std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed" << std::endl;
}
}
#endif
@ -218,29 +158,41 @@ int WatchdogTask::performRunningOperation() {
int WatchdogTask::performNotRunningOperation(LoopResult type) {
// Latch prevents spam on console
if(not printNotRunningLatch) {
if(type == LoopResult::HUNG_UP) {
if (not printNotRunningLatch) {
if (type == LoopResult::HUNG_UP) {
std::cout << "eive-watchdog: FIFO writer hung up!" << std::endl;
}
else {
} else {
std::cout << "eive-watchdog: The FIFO timed out!" << std::endl;
}
printNotRunningLatch = true;
}
if(obswRunning) {
if (not notRunningStart.has_value()) {
notRunningStart = std::chrono::system_clock::now();
}
if (obswRunning) {
#if WATCHDOG_CREATE_FILE_IF_RUNNING == 1
if (std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) {
int result = std::remove(watchdog::RUNNING_FILE_NAME.c_str());
if(result != 0) {
std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code " <<
errno << ": " << strerror(errno) << std::endl;
if (result != 0) {
std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code " << errno
<< ": " << strerror(errno) << std::endl;
}
}
#endif
obswRunning = false;
}
if(type == LoopResult::HUNG_UP) {
if (watchingObsw) {
auto timeNotRunning = std::chrono::system_clock::now() - notRunningStart.value();
if (std::chrono::duration_cast<std::chrono::milliseconds>(timeNotRunning).count() >
watchdog::MAX_NOT_RUNNING_MS) {
std::cout << "Restarting OBSW" << std::endl;
std::system("systemctl restart obsw");
}
}
if (type == LoopResult::HUNG_UP) {
using namespace std::chrono_literals;
// Prevent spam
std::this_thread::sleep_for(2000ms);
@ -248,11 +200,77 @@ int WatchdogTask::performNotRunningOperation(LoopResult type) {
return 0;
}
int WatchdogTask::performSuspendOperation() {
if(state == States::RUNNING or state == States::FAULTY) {
bool WatchdogTask::stateMachine(LoopResult loopResult) {
using namespace std::chrono_literals;
bool sleep = false;
switch (state) {
case (States::RUNNING): {
switch (loopResult) {
case (LoopResult::TIMEOUT):
case (LoopResult::HUNG_UP): {
performNotRunningOperation(loopResult);
break;
}
case (LoopResult::OK): {
performRunningOperation();
break;
}
case (LoopResult::SUSPEND_REQ): {
if (state == States::RUNNING or state == States::FAULTY) {
std::cout << "eive-watchdog: Suspending watchdog operations" << std::endl;
watchdogRunning = false;
state = States::SUSPENDED;
}
return 0;
performSuspendOperation();
sleep = true;
break;
}
case (LoopResult::CANCEL_REQ): {
std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl;
return false;
}
}
}
case (States::FAULTY):
case (States::SUSPENDED):
case (States::NOT_STARTED): {
switch (loopResult) {
case (LoopResult::SUSPEND_REQ): {
// Ignore and also delay
sleep = true;
break;
}
case (LoopResult::START_REQ):
case (LoopResult::START_WITH_WATCH_REQ): {
if (state == States::NOT_STARTED or state == States::FAULTY) {
state = States::RUNNING;
}
std::cout << "Watch request received. Restarting OBSW if not running for "
<< watchdog::MAX_NOT_RUNNING_MS / 1000 << " seconds" << std::endl;
if (loopResult == LoopResult::START_REQ) {
watchingObsw = false;
} else if (loopResult == LoopResult::START_WITH_WATCH_REQ) {
watchingObsw = true;
}
performRunningOperation();
break;
}
default: {
sleep = true;
}
}
break;
}
}
if (loopResult == LoopResult::FAULT) {
// Configuration error
std::cerr << "Fault has occured in watchdog loop" << std::endl;
// Prevent spam
sleep = true;
}
if (sleep) {
std::this_thread::sleep_for(1000ms);
}
return true;
}
int WatchdogTask::performSuspendOperation() { return 0; }

View File

@ -2,23 +2,21 @@
#define WATCHDOG_WATCHDOG_H_
#include <array>
#include <chrono>
#include <cstdint>
#include <optional>
#include <string>
class WatchdogTask {
public:
enum class States {
NOT_STARTED,
RUNNING,
SUSPENDED,
FAULTY
};
public:
enum class States { NOT_STARTED, RUNNING, SUSPENDED, FAULTY };
enum class LoopResult {
OK,
SUSPEND_RQ,
CANCEL_RQ,
RESTART_RQ,
START_REQ,
START_WITH_WATCH_REQ,
SUSPEND_REQ,
CANCEL_REQ,
TIMEOUT,
HUNG_UP,
FAULT
@ -29,18 +27,23 @@ public:
virtual ~WatchdogTask();
int performOperation();
private:
private:
int fd = 0;
bool obswRunning = false;
bool watchdogRunning = false;
bool watchingObsw = false;
bool printNotRunningLatch = false;
std::array<uint8_t, 64> buf;
std::optional<std::chrono::time_point<std::chrono::system_clock>> notRunningStart;
States state = States::NOT_STARTED;
// Primary loop. Takes care of delaying, and reading from the communication pipe and translating
// messages to loop results.
LoopResult watchdogLoop();
bool stateMachine(LoopResult result);
LoopResult pollEvent(struct pollfd& waiter);
LoopResult parseCommandByte(ssize_t readLen);
LoopResult parseCommand(ssize_t readLen);
int performRunningOperation();
int performNotRunningOperation(LoopResult type);

View File

@ -5,17 +5,31 @@
namespace watchdog {
namespace first {
// Start or restart character
static constexpr char START_CHAR = 'b';
// Suspend watchdog operations temporarily
static constexpr char SUSPEND_CHAR = 's';
// Resume watchdog operations
static constexpr char RESTART_CHAR = 'b';
// Causes the watchdog to close down
static constexpr char CANCEL_CHAR = 'c';
static constexpr char IDLE_CHAR = 'i';
} // namespace first
namespace second {
// Supplied with the start character. This will instruct the watchdog to actually watch
// the OBSW is runnng all the time.
static constexpr char WATCH_FLAG = 'w';
} // namespace second
static constexpr int TIMEOUT_MS = 5 * 1000;
// 2 minutes
static constexpr unsigned MAX_NOT_RUNNING_MS = 2 * 60 * 1000;
const std::string FIFO_NAME = "/tmp/watchdog-pipe";
const std::string RUNNING_FILE_NAME = "/tmp/obsw-running";
}
} // namespace watchdog
#endif /* WATCHDOG_DEFINITIONS_H_ */

View File

@ -1,7 +1,7 @@
#include "Watchdog.h"
#include <iostream>
#include "Watchdog.h"
/**
* @brief This watchdog application uses a FIFO to check whether the OBSW is still running.
* It checks whether the OBSW writes to the the FIFO regularly.
@ -11,14 +11,12 @@ int main() {
try {
WatchdogTask watchdogTask;
int result = watchdogTask.performOperation();
if(result != 0) {
if (result != 0) {
return result;
}
}
catch(const std::runtime_error& e) {
} catch (const std::runtime_error& e) {
std::cerr << "eive-watchdog: Run time exception " << e.what() << std::endl;
return -1;
}
return 0;
}