From f789380343ac2d9121b76e67e254046bef0e6bac Mon Sep 17 00:00:00 2001 From: Robin Mueller Date: Thu, 23 Feb 2023 23:56:11 +0100 Subject: [PATCH] reworked watchdog --- CHANGELOG.md | 12 + bsp_q7s/core/CMakeLists.txt | 2 +- bsp_q7s/core/CoreController.cpp | 98 +------ bsp_q7s/core/CoreController.h | 6 - bsp_q7s/core/WatchdogHandler.cpp | 84 ++++++ bsp_q7s/core/WatchdogHandler.h | 23 ++ bsp_q7s/obsw.cpp | 52 +++- bsp_q7s/obsw.h | 6 +- scripts/auto-formatter.sh | 1 + watchdog/CMakeLists.txt | 9 +- watchdog/Watchdog.cpp | 428 ++++++++++++++++--------------- watchdog/Watchdog.h | 67 ++--- watchdog/definitions.h | 20 +- watchdog/main.cpp | 28 +- 14 files changed, 470 insertions(+), 366 deletions(-) create mode 100644 bsp_q7s/core/WatchdogHandler.cpp create mode 100644 bsp_q7s/core/WatchdogHandler.h diff --git a/CHANGELOG.md b/CHANGELOG.md index dd893586..701ec784 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,18 @@ will consitute of a breaking change warranting a new major release: # [unreleased] +## Changed + +- Improved the OBSW watchdog by adding a watch functionality. The watch functionality is optional + and has to be enabled specifically by the application being watched by the watchdog when + starting the watchdog. If the watch functionality is enabled and the OBSW has not pinged + the watchdog via the FIFO for 2 minutes, the watchdog will restart the OBSW service via systemd. + The primary OBSW will only activate the watch functionality if it is the OBSW inside the + `/usr/bin` directory. This allows debugging the system by leaving flashed or manually copied + debugging images 2 minutes to start the watchdog without the watch functionality. +- The SD card prefix is now set earlier inside the `CoreController` constructor +- The watchdog handling was moved outside the `CoreController` into the main loop. + # [v1.31.1] ## Fixed diff --git a/bsp_q7s/core/CMakeLists.txt b/bsp_q7s/core/CMakeLists.txt index 15d361fd..b726885b 100644 --- a/bsp_q7s/core/CMakeLists.txt +++ b/bsp_q7s/core/CMakeLists.txt @@ -1,4 +1,4 @@ target_sources(${OBSW_NAME} PRIVATE CoreController.cpp scheduling.cpp - ObjectFactory.cpp) + ObjectFactory.cpp WatchdogHandler.cpp) target_sources(${SIMPLE_OBSW_NAME} PRIVATE scheduling.cpp) diff --git a/bsp_q7s/core/CoreController.cpp b/bsp_q7s/core/CoreController.cpp index d3264b73..8379dad2 100644 --- a/bsp_q7s/core/CoreController.cpp +++ b/bsp_q7s/core/CoreController.cpp @@ -33,12 +33,7 @@ xsc::Copy CoreController::CURRENT_COPY = xsc::Copy::NO_COPY; CoreController::CoreController(object_id_t objectId) : ExtendedControllerBase(objectId, 5), opDivider5(5), opDivider10(10), hkSet(this) { - ReturnValue_t result = returnvalue::OK; try { - result = initWatchdogFifo(); - if (result != returnvalue::OK) { - sif::warning << "CoreController::CoreController: Watchdog FIFO init failed" << std::endl; - } sdcMan = SdCardManager::instance(); if (sdcMan == nullptr) { sif::error << "CoreController::CoreController: SD card manager invalid!" << std::endl; @@ -47,11 +42,25 @@ CoreController::CoreController(object_id_t objectId) if (not BLOCKING_SD_INIT) { sdcMan->setBlocking(false); } + auto sdCard = sdcMan->getPreferredSdCard(); + if (not sdCard.has_value()) { + sif::error << "CoreController::initializeAfterTaskCreation: " + "Issues getting preferred SD card, setting to 0" + << std::endl; + sdCard = sd::SdCard::SLOT_0; + } + sdInfo.active = sdCard.value(); + sdcMan->setActiveSdCard(sdInfo.active); + currMntPrefix = sdcMan->getCurrentMountPrefix(); getCurrentBootCopy(CURRENT_CHIP, CURRENT_COPY); } catch (const std::filesystem::filesystem_error &e) { sif::error << "CoreController::CoreController: Failed with exception " << e.what() << std::endl; } + // Add script folder to path + char *currentEnvPath = getenv("PATH"); + std::string updatedEnvPath = std::string(currentEnvPath) + ":/home/root/scripts:/usr/local/bin"; + setenv("PATH", updatedEnvPath.c_str(), true); sdCardCheckCd.timeOut(); eventQueue = QueueFactory::instance()->createMessageQueue(5, EventMessage::MAX_MESSAGE_SIZE); } @@ -76,7 +85,6 @@ void CoreController::performControlOperation() { } } } - performWatchdogControlOperation(); sdStateMachine(); performMountedSdCardOperations(); if (sdCardCheckCd.hasTimedOut()) { @@ -146,19 +154,6 @@ ReturnValue_t CoreController::initialize() { ReturnValue_t CoreController::initializeAfterTaskCreation() { ReturnValue_t result = returnvalue::OK; - auto sdCard = sdcMan->getPreferredSdCard(); - if (not sdCard) { - return returnvalue::FAILED; - } - sdInfo.active = sdCard.value(); - if (sdInfo.active == sd::SdCard::NONE) { - sif::error << "CoreController::initializeAfterTaskCreation: " - "Issues getting preferred SD card, setting to 0" - << std::endl; - sdInfo.active = sd::SdCard::SLOT_0; - } - sdcMan->setActiveSdCard(sdInfo.active); - currMntPrefix = sdcMan->getCurrentMountPrefix(); if (BLOCKING_SD_INIT) { result = initSdCardBlocking(); if (result != returnvalue::OK and result != SdCardManager::ALREADY_MOUNTED) { @@ -170,12 +165,7 @@ ReturnValue_t CoreController::initializeAfterTaskCreation() { if (result != returnvalue::OK) { sif::warning << "CoreController::initialize: Version initialization failed" << std::endl; } - // Add script folder to path - char *currentEnvPath = getenv("PATH"); - std::string updatedEnvPath = std::string(currentEnvPath) + ":/home/root/scripts:/usr/local/bin"; - setenv("PATH", updatedEnvPath.c_str(), true); updateProtInfo(); - initPrint(); return ExtendedControllerBase::initializeAfterTaskCreation(); } @@ -839,36 +829,6 @@ void CoreController::getCurrentBootCopy(xsc::Chip &chip, xsc::Copy ©) { copy = static_cast(xscCopy); } -ReturnValue_t CoreController::initWatchdogFifo() { - if (not std::filesystem::exists(watchdog::FIFO_NAME)) { - // Still return returnvalue::OK for now - sif::info << "Watchdog FIFO " << watchdog::FIFO_NAME << " does not exist, can't initiate" - << " watchdog" << std::endl; - return returnvalue::OK; - } - // Open FIFO write only and non-blocking to prevent SW from killing itself. - watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK); - if (watchdogFifoFd < 0) { - if (errno == ENXIO) { - watchdogFifoFd = RETRY_FIFO_OPEN; - sif::info << "eive-watchdog not running. FIFO can not be opened" << std::endl; - } else { - sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with " << errno - << ": " << strerror(errno) << std::endl; - return returnvalue::FAILED; - } - } - return returnvalue::OK; -} - -void CoreController::initPrint() { -#if OBSW_VERBOSE_LEVEL >= 1 - if (watchdogFifoFd > 0) { - sif::info << "Opened watchdog FIFO successfully.." << std::endl; - } -#endif -} - ReturnValue_t CoreController::actionXscReboot(const uint8_t *data, size_t size) { if (size < 1) { return HasActionsIF::INVALID_PARAMETERS; @@ -1223,36 +1183,6 @@ ReturnValue_t CoreController::handleProtInfoUpdateLine(std::string nextLine) { return returnvalue::OK; } -void CoreController::performWatchdogControlOperation() { - // Only perform each fifth iteration - if (watchdogFifoFd != 0 and opDivider5.check()) { - if (watchdogFifoFd == RETRY_FIFO_OPEN) { - // Open FIFO write only and non-blocking - watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK); - if (watchdogFifoFd < 0) { - if (errno == ENXIO) { - watchdogFifoFd = RETRY_FIFO_OPEN; - // No printout for now, would be spam - return; - } else { - sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with " - << errno << ": " << strerror(errno) << std::endl; - return; - } - } - sif::info << "Opened " << watchdog::FIFO_NAME << " successfully" << std::endl; - } else if (watchdogFifoFd > 0) { - // Write to OBSW watchdog FIFO here - const char writeChar = 'a'; - ssize_t writtenBytes = write(watchdogFifoFd, &writeChar, 1); - if (writtenBytes < 0) { - sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno) - << std::endl; - } - } - } -} - void CoreController::performMountedSdCardOperations() { auto mountedSdCardOp = [&](sd::SdCard sdCard, std::string mntPoint) { if (not performOneShotSdCardOpsSwitch) { diff --git a/bsp_q7s/core/CoreController.h b/bsp_q7s/core/CoreController.h index 65ee20ef..c90e5ee4 100644 --- a/bsp_q7s/core/CoreController.h +++ b/bsp_q7s/core/CoreController.h @@ -162,9 +162,6 @@ class CoreController : public ExtendedControllerBase { private: static constexpr MutexIF::TimeoutType TIMEOUT_TYPE = MutexIF::TimeoutType::WAITING; static constexpr uint32_t MUTEX_TIMEOUT = 20; - // Designated value for rechecking FIFO open - static constexpr int RETRY_FIFO_OPEN = -2; - int watchdogFifoFd = 0; GpsHyperion::FixMode gpsFix = GpsHyperion::FixMode::UNKNOWN; // States for SD state machine, which is used in non-blocking mode @@ -260,7 +257,6 @@ class CoreController : public ExtendedControllerBase { ReturnValue_t performSdCardCheck(); ReturnValue_t timeFileHandler(); ReturnValue_t initBootCopyFile(); - ReturnValue_t initWatchdogFifo(); ReturnValue_t initSdCardBlocking(); bool startSdStateMachine(sd::SdCard targetActiveSd, SdCfgMode mode, MessageQueueId_t commander, DeviceCommandId_t actionId); @@ -285,8 +281,6 @@ class CoreController : public ExtendedControllerBase { ReturnValue_t gracefulShutdownTasks(xsc::Chip chip, xsc::Copy copy, bool& protOpPerformed); - void performWatchdogControlOperation(); - ReturnValue_t handleProtInfoUpdateLine(std::string nextLine); int handleBootCopyProtAtIndex(xsc::Chip targetChip, xsc::Copy targetCopy, bool protect, bool& protOperationPerformed, bool selfChip, bool selfCopy, diff --git a/bsp_q7s/core/WatchdogHandler.cpp b/bsp_q7s/core/WatchdogHandler.cpp new file mode 100644 index 00000000..b5a9edc7 --- /dev/null +++ b/bsp_q7s/core/WatchdogHandler.cpp @@ -0,0 +1,84 @@ +#include "WatchdogHandler.h" + +#include +#include + +#include +#include +#include + +#include "fsfw/serviceinterface.h" +#include "watchdog/definitions.h" + +WatchdogHandler::WatchdogHandler() {} + +void WatchdogHandler::periodicOperation() { + if (watchdogFifoFd != 0) { + if (watchdogFifoFd == RETRY_FIFO_OPEN) { + // Open FIFO write only and non-blocking + watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK); + if (watchdogFifoFd < 0) { + if (errno == ENXIO) { + watchdogFifoFd = RETRY_FIFO_OPEN; + // No printout for now, would be spam + return; + } else { + sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with " + << errno << ": " << strerror(errno) << std::endl; + return; + } + } + sif::info << "Opened " << watchdog::FIFO_NAME << " successfully" << std::endl; + performStartHandling(); + } else if (watchdogFifoFd > 0) { + // Write to OBSW watchdog FIFO here + const char writeChar = watchdog::first::IDLE_CHAR; + ssize_t writtenBytes = write(watchdogFifoFd, &writeChar, 1); + if (writtenBytes < 0) { + sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno) + << std::endl; + } + } + } +} + +ReturnValue_t WatchdogHandler::initialize(bool enableWatchdogFunction) { + using namespace std::filesystem; + this->enableWatchFunction = enableWatchdogFunction; + if (not std::filesystem::exists(watchdog::FIFO_NAME)) { + // Still return returnvalue::OK for now + sif::info << "Watchdog FIFO " << watchdog::FIFO_NAME << " does not exist, can't initiate" + << " watchdog" << std::endl; + return returnvalue::OK; + } + // Open FIFO write only and non-blocking to prevent SW from killing itself. + watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK); + if (watchdogFifoFd < 0) { + if (errno == ENXIO) { + watchdogFifoFd = RETRY_FIFO_OPEN; + sif::info << "eive-watchdog not running. FIFO can not be opened" << std::endl; + } else { + sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with " << errno + << ": " << strerror(errno) << std::endl; + return returnvalue::FAILED; + } + } + return performStartHandling(); +} + +ReturnValue_t WatchdogHandler::performStartHandling() { + char startBuf[2]; + size_t writeLen = 1; + startBuf[0] = watchdog::first::START_CHAR; + if (enableWatchFunction) { + writeLen += 1; + startBuf[1] = watchdog::second::WATCH_FLAG; + } + ssize_t writtenBytes = write(watchdogFifoFd, &startBuf, writeLen); + if (writtenBytes < 0) { + sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno) + << std::endl; + return returnvalue::FAILED; + } + return returnvalue::OK; +} diff --git a/bsp_q7s/core/WatchdogHandler.h b/bsp_q7s/core/WatchdogHandler.h new file mode 100644 index 00000000..5db42286 --- /dev/null +++ b/bsp_q7s/core/WatchdogHandler.h @@ -0,0 +1,23 @@ +#ifndef BSP_Q7S_CORE_WATCHDOGHANDLER_H_ +#define BSP_Q7S_CORE_WATCHDOGHANDLER_H_ + +#include "fsfw/returnvalues/returnvalue.h" + +class WatchdogHandler { + public: + WatchdogHandler(); + + ReturnValue_t initialize(bool enableWatchFunction); + void periodicOperation(); + + private: + // Designated value for rechecking FIFO open + static constexpr int RETRY_FIFO_OPEN = -2; + + int watchdogFifoFd = 0; + bool enableWatchFunction = false; + + ReturnValue_t performStartHandling(); +}; + +#endif /* BSP_Q7S_CORE_WATCHDOGHANDLER_H_ */ diff --git a/bsp_q7s/obsw.cpp b/bsp_q7s/obsw.cpp index cfd28565..cd69bdf5 100644 --- a/bsp_q7s/obsw.cpp +++ b/bsp_q7s/obsw.cpp @@ -9,6 +9,7 @@ #include #include "OBSWConfig.h" +#include "bsp_q7s/core/WatchdogHandler.h" #include "commonConfig.h" #include "core/scheduling.h" #include "fsfw/tasks/TaskFactory.h" @@ -24,6 +25,9 @@ static const char* DEV_STRING = "Xiphos Q7S FM"; #else static const char* DEV_STRING = "Xiphos Q7S EM"; #endif + +WatchdogHandler WATCHDOG_HANDLER; + int obsw::obsw() { using namespace fsfw; std::cout << "-- EIVE OBSW --" << std::endl; @@ -44,6 +48,35 @@ int obsw::obsw() { } #endif + // Delay the boot if applicable. + bootDelayHandling(); + + bool initWatchFunction = false; + if (std::filesystem::current_path() == "/usr/bin") { + initWatchFunction = true; + } + ReturnValue_t result = WATCHDOG_HANDLER.initialize(initWatchFunction); + if (result != returnvalue::OK) { + std::cerr << "Initiating EIVE watchdog handler failed" << std::endl; + } + + scheduling::initMission(); + + // Command the EIVE system to safe mode +#if OBSW_COMMAND_SAFE_MODE_AT_STARTUP == 1 + commandEiveSystemToSafe(); +#else + announceAllModes(); +#endif + + for (;;) { + WATCHDOG_HANDLER.periodicOperation(); + TaskFactory::delayTask(1000); + } + return 0; +} + +void obsw::bootDelayHandling() { const char* homedir = nullptr; homedir = getenv("HOME"); if (homedir == nullptr) { @@ -71,31 +104,26 @@ int obsw::obsw() { std::cout << "Delaying OBSW start for " << bootDelaySecs << " seconds" << std::endl; TaskFactory::delayTask(bootDelaySecs * 1000); } +} - scheduling::initMission(); - - // Command the EIVE system to safe mode +void obsw::commandEiveSystemToSafe() { auto sysQueueId = satsystem::EIVE_SYSTEM.getCommandQueue(); CommandMessage msg; -#if OBSW_COMMAND_SAFE_MODE_AT_STARTUP == 1 ModeMessage::setCmdModeMessage(msg, acs::AcsMode::SAFE, 0); ReturnValue_t result = MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false); if (result != returnvalue::OK) { sif::error << "Sending safe mode command to EIVE system failed" << std::endl; } -#else +} + +void obsw::announceAllModes() { + auto sysQueueId = satsystem::EIVE_SYSTEM.getCommandQueue(); + CommandMessage msg; ModeMessage::setModeAnnounceMessage(msg, true); ReturnValue_t result = MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false); if (result != returnvalue::OK) { sif::error << "Sending safe mode command to EIVE system failed" << std::endl; } -#endif - - for (;;) { - /* Suspend main thread by sleeping it. */ - TaskFactory::delayTask(5000); - } - return 0; } diff --git a/bsp_q7s/obsw.h b/bsp_q7s/obsw.h index c2d974ae..351925aa 100644 --- a/bsp_q7s/obsw.h +++ b/bsp_q7s/obsw.h @@ -5,6 +5,10 @@ namespace obsw { int obsw(); -}; +void bootDelayHandling(); +void commandEiveSystemToSafe(); +void announceAllModes(); + +}; // namespace obsw #endif /* BSP_Q7S_CORE_OBSW_H_ */ diff --git a/scripts/auto-formatter.sh b/scripts/auto-formatter.sh index 958ba0ac..1bea10a4 100755 --- a/scripts/auto-formatter.sh +++ b/scripts/auto-formatter.sh @@ -4,6 +4,7 @@ if [[ ! -f README.md ]]; then fi folder_list=( + "./watchdog" "./mission" "./linux" "./bsp_q7s" diff --git a/watchdog/CMakeLists.txt b/watchdog/CMakeLists.txt index ecb50627..f7c7330c 100644 --- a/watchdog/CMakeLists.txt +++ b/watchdog/CMakeLists.txt @@ -1,10 +1,5 @@ -target_sources(${WATCHDOG_NAME} PRIVATE - main.cpp - Watchdog.cpp -) +target_sources(${WATCHDOG_NAME} PRIVATE main.cpp Watchdog.cpp) -target_include_directories(${WATCHDOG_NAME} PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} -) +target_include_directories(${WATCHDOG_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) install(TARGETS ${WATCHDOG_NAME} RUNTIME DESTINATION bin) diff --git a/watchdog/Watchdog.cpp b/watchdog/Watchdog.cpp index a14bae53..21fdb31c 100644 --- a/watchdog/Watchdog.cpp +++ b/watchdog/Watchdog.cpp @@ -1,258 +1,276 @@ #include "Watchdog.h" -#include "definitions.h" #include -#include -#include -#include -#include #include +#include +#include +#include +#include -#include -#include -#include +#include #include #include +#include +#include +#include +#include "definitions.h" -WatchdogTask::WatchdogTask (): fd(0) { - int result = 0; - // Only create the FIFO if it does not exist yet - if(not std::filesystem::exists(watchdog::FIFO_NAME)) { - // Permission 666 or rw-rw-rw- - mode_t mode = DEFFILEMODE; - result = mkfifo(watchdog::FIFO_NAME.c_str(), mode); - if(result != 0) { - std::cerr << "eive-watchdog: Could not created named pipe at " << - watchdog::FIFO_NAME << ", error " << errno << ": " << strerror(errno) << - std::endl; - throw std::runtime_error("eive-watchdog: FIFO creation failed"); - } -#if WATCHDOG_VERBOSE_LEVEL >= 1 - std::cout << "eive-watchdog: Pipe at " << watchdog::FIFO_NAME << - " created successfully" << std::endl; -#endif +WatchdogTask::WatchdogTask() : fd(0) { + int result = 0; + // Only create the FIFO if it does not exist yet + if (not std::filesystem::exists(watchdog::FIFO_NAME)) { + // Permission 666 or rw-rw-rw- + mode_t mode = DEFFILEMODE; + result = mkfifo(watchdog::FIFO_NAME.c_str(), mode); + if (result != 0) { + std::cerr << "eive-watchdog: Could not created named pipe at " << watchdog::FIFO_NAME + << ", error " << errno << ": " << strerror(errno) << std::endl; + throw std::runtime_error("eive-watchdog: FIFO creation failed"); } +#if WATCHDOG_VERBOSE_LEVEL >= 1 + std::cout << "eive-watchdog: Pipe at " << watchdog::FIFO_NAME << " created successfully" + << std::endl; +#endif + } } -WatchdogTask::~WatchdogTask() { - -} +WatchdogTask::~WatchdogTask() {} int WatchdogTask::performOperation() { - // Open FIFO read only and non-blocking - fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK); - if(fd < 0) { - std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME << - "read-only failed with " << errno << ": " << strerror(errno) << std::endl; - return -1; - } - state = States::RUNNING; + // Open FIFO read only and non-blocking + fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK); + if (fd < 0) { + std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME << "read-only failed with " + << errno << ": " << strerror(errno) << std::endl; + return -1; + } + state = States::NOT_STARTED; - while(true) { - WatchdogTask::LoopResult loopResult = watchdogLoop(); - switch(loopResult) { - case(LoopResult::OK): { - performRunningOperation(); - break; - } - case(LoopResult::CANCEL_RQ): { - std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl; - return 0; - } - case(LoopResult::SUSPEND_RQ): { - performSuspendOperation(); - break; - } - case(LoopResult::TIMEOUT): { - performNotRunningOperation(loopResult); - break; - } - case(LoopResult::HUNG_UP): { - performNotRunningOperation(loopResult); - break; - } - case(LoopResult::RESTART_RQ): { - if(state == States::SUSPENDED or state == States::FAULTY) { - performRunningOperation(); - } - break; - } - case(LoopResult::FAULT): { - using namespace std::chrono_literals; - // Configuration error - std::cerr << "Fault has occured in watchdog loop" << std::endl; - // Prevent spam - std::this_thread::sleep_for(2000ms); - - } - } + while (true) { + WatchdogTask::LoopResult loopResult = watchdogLoop(); + if (not stateMachine(loopResult)) { + break; } - if (close(fd) < 0) { - std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME << - "failed, error " << errno << ": " << strerror(errno) << std::endl; - } - std::cout << "eive-watchdog: Finished" << std::endl; - return 0; + } + if (close(fd) < 0) { + std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME << "failed, error " + << errno << ": " << strerror(errno) << std::endl; + } + std::cout << "eive-watchdog: Finished" << std::endl; + return 0; } WatchdogTask::LoopResult WatchdogTask::watchdogLoop() { - using namespace std::chrono_literals; - struct pollfd waiter = {}; - waiter.fd = fd; - waiter.events = POLLIN; + using namespace std::chrono_literals; + struct pollfd waiter = {}; + waiter.fd = fd; + waiter.events = POLLIN; - switch(state) { - case(States::SUSPENDED): { - // Sleep, then check whether a restart request was received - std::this_thread::sleep_for(1000ms); - break; + // Only poll one file descriptor with timeout + switch (poll(&waiter, 1, watchdog::TIMEOUT_MS)) { + case (0): { + return LoopResult::TIMEOUT; } - case(States::RUNNING): { - // Continue as usual - break; - } - case(States::NOT_STARTED): { - // This should not happen - std::cerr << "eive-watchdog: State is NOT_STARTED, configuration error" << std::endl; - break; - } - case(States::FAULTY): { - // TODO: Not sure what to do yet. Continue for now - break; - } - } - - // 10 seconds timeout, only poll one file descriptor - switch(poll(&waiter, 1, watchdog::TIMEOUT_MS)) { - case(0): { - return LoopResult::TIMEOUT; - } - case(1): { - return pollEvent(waiter); + case (1): { + return pollEvent(waiter); } default: { - std::cerr << "eive-watchdog: Unknown poll error at " << watchdog::FIFO_NAME << ", error " << - errno << ": " << strerror(errno) << std::endl; - break; + std::cerr << "eive-watchdog: Unknown poll error at " << watchdog::FIFO_NAME << ", error " + << errno << ": " << strerror(errno) << std::endl; + break; } - } - return LoopResult::OK; + } + return LoopResult::OK; } WatchdogTask::LoopResult WatchdogTask::pollEvent(struct pollfd& waiter) { - if (waiter.revents & POLLIN) { - ssize_t readLen = read(fd, buf.data(), buf.size()); - if (readLen < 0) { - std::cerr << "eive-watchdog: Read error on pipe " << watchdog::FIFO_NAME << - ", error " << errno << ": " << strerror(errno) << std::endl; - return LoopResult::OK; - } + if (waiter.revents & POLLIN) { + ssize_t readLen = read(fd, buf.data(), buf.size()); + if (readLen < 0) { + std::cerr << "eive-watchdog: Read error on pipe " << watchdog::FIFO_NAME << ", error " + << errno << ": " << strerror(errno) << std::endl; + return LoopResult::OK; + } #if WATCHDOG_VERBOSE_LEVEL == 2 - std::cout << "Read " << readLen << " byte(s) on the pipe " << FIFO_NAME - << std::endl; + std::cout << "Read " << readLen << " byte(s) on the pipe " << FIFO_NAME << std::endl; #endif - else if(readLen >= 1) { - return parseCommandByte(readLen); - } + else if (readLen >= 1) { + return parseCommand(readLen); + } - } - else if(waiter.revents & POLLERR) { - std::cerr << "eive-watchdog: Poll error error on pipe " << watchdog::FIFO_NAME << - std::endl; - return LoopResult::FAULT; - } - else if (waiter.revents & POLLHUP) { - // Writer closed its end - return LoopResult::HUNG_UP; - } + } else if (waiter.revents & POLLERR) { + std::cerr << "eive-watchdog: Poll error error on pipe " << watchdog::FIFO_NAME << std::endl; return LoopResult::FAULT; + } else if (waiter.revents & POLLHUP) { + // Writer closed its end + return LoopResult::HUNG_UP; + } + return LoopResult::FAULT; } -WatchdogTask::LoopResult WatchdogTask::parseCommandByte(ssize_t readLen) { - for(ssize_t idx = 0; idx < readLen; idx++) { - char readChar = buf[idx]; - // Cancel request - if(readChar == watchdog::CANCEL_CHAR) { - return LoopResult::CANCEL_RQ; - } - // Begin request. Does not work if the operation was not suspended before - else if(readChar == watchdog::RESTART_CHAR) { - return LoopResult::RESTART_RQ; - } - // Suspend request - else if(readChar == watchdog::SUSPEND_CHAR) { - return LoopResult::SUSPEND_RQ; - } - // Everything else: All working as expected +WatchdogTask::LoopResult WatchdogTask::parseCommand(ssize_t readLen) { + char readChar = buf[0]; + // Cancel request + if (readChar == watchdog::first::CANCEL_CHAR) { + return LoopResult::CANCEL_REQ; + } else if (readChar == watchdog::first::SUSPEND_CHAR) { + // Suspend request + return LoopResult::SUSPEND_REQ; + } else if (readChar == watchdog::first::START_CHAR) { + if (readLen == 2 and static_cast(buf[1]) == watchdog::second::WATCH_FLAG) { + return LoopResult::START_WITH_WATCH_REQ; } - return LoopResult::OK; + return LoopResult::START_REQ; + } + // Everything else: All working as expected + return LoopResult::OK; } int WatchdogTask::performRunningOperation() { - if(state != States::RUNNING) { - state = States::RUNNING; + if (state != States::RUNNING) { + state = States::RUNNING; + } + if (notRunningStart.has_value()) { + notRunningStart = std::nullopt; + } + + if (not obswRunning) { + if (printNotRunningLatch) { + // Reset latch so user can see timeouts + printNotRunningLatch = false; } - if(not obswRunning) { - if(printNotRunningLatch) { - // Reset latch so user can see timeouts - printNotRunningLatch = false; - } - - obswRunning = true; - std::cout << "eive-watchdog: Running OBSW detected.." << std::endl; + obswRunning = true; + std::cout << "eive-watchdog: Running OBSW detected.." << std::endl; #if WATCHDOG_CREATE_FILE_IF_RUNNING == 1 - std::cout << "eive-watchdog: Creating " << watchdog::RUNNING_FILE_NAME << std::endl; - if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) { - std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME); - if(not obswRunningFile.good()) { - std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed" - << std::endl; - } - } -#endif + std::cout << "eive-watchdog: Creating " << watchdog::RUNNING_FILE_NAME << std::endl; + if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) { + std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME); + if (not obswRunningFile.good()) { + std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed" << std::endl; + } } - return 0; +#endif + } + return 0; } int WatchdogTask::performNotRunningOperation(LoopResult type) { - // Latch prevents spam on console - if(not printNotRunningLatch) { - if(type == LoopResult::HUNG_UP) { - std::cout << "eive-watchdog: FIFO writer hung up!" << std::endl; - } - else { - std::cout << "eive-watchdog: The FIFO timed out!" << std::endl; - } - printNotRunningLatch = true; + // Latch prevents spam on console + if (not printNotRunningLatch) { + if (type == LoopResult::HUNG_UP) { + std::cout << "eive-watchdog: FIFO writer hung up!" << std::endl; + } else { + std::cout << "eive-watchdog: The FIFO timed out!" << std::endl; } + printNotRunningLatch = true; + } - if(obswRunning) { + if (not notRunningStart.has_value()) { + notRunningStart = std::chrono::system_clock::now(); + } + + if (obswRunning) { #if WATCHDOG_CREATE_FILE_IF_RUNNING == 1 - if (std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) { - int result = std::remove(watchdog::RUNNING_FILE_NAME.c_str()); - if(result != 0) { - std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code " << - errno << ": " << strerror(errno) << std::endl; - } - } + if (std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) { + int result = std::remove(watchdog::RUNNING_FILE_NAME.c_str()); + if (result != 0) { + std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code " << errno + << ": " << strerror(errno) << std::endl; + } + } #endif - obswRunning = false; + obswRunning = false; + } + + if (watchingObsw) { + auto timeNotRunning = std::chrono::system_clock::now() - notRunningStart.value(); + if (std::chrono::duration_cast(timeNotRunning).count() > + watchdog::MAX_NOT_RUNNING_MS) { + std::cout << "Restarting OBSW" << std::endl; + std::system("systemctl restart obsw"); } - if(type == LoopResult::HUNG_UP) { - using namespace std::chrono_literals; - // Prevent spam - std::this_thread::sleep_for(2000ms); - } - return 0; + } + if (type == LoopResult::HUNG_UP) { + using namespace std::chrono_literals; + // Prevent spam + std::this_thread::sleep_for(2000ms); + } + return 0; } -int WatchdogTask::performSuspendOperation() { - if(state == States::RUNNING or state == States::FAULTY) { - std::cout << "eive-watchdog: Suspending watchdog operations" << std::endl; - watchdogRunning = false; - state = States::SUSPENDED; +bool WatchdogTask::stateMachine(LoopResult loopResult) { + using namespace std::chrono_literals; + bool sleep = false; + switch (state) { + case (States::RUNNING): { + switch (loopResult) { + case (LoopResult::TIMEOUT): + case (LoopResult::HUNG_UP): { + performNotRunningOperation(loopResult); + break; + } + case (LoopResult::OK): { + performRunningOperation(); + break; + } + case (LoopResult::SUSPEND_REQ): { + if (state == States::RUNNING or state == States::FAULTY) { + std::cout << "eive-watchdog: Suspending watchdog operations" << std::endl; + state = States::SUSPENDED; + } + performSuspendOperation(); + sleep = true; + break; + } + case (LoopResult::CANCEL_REQ): { + std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl; + return false; + } + } } - return 0; + case (States::FAULTY): + case (States::SUSPENDED): + case (States::NOT_STARTED): { + switch (loopResult) { + case (LoopResult::SUSPEND_REQ): { + // Ignore and also delay + sleep = true; + break; + } + case (LoopResult::START_REQ): + case (LoopResult::START_WITH_WATCH_REQ): { + if (state == States::NOT_STARTED or state == States::FAULTY) { + state = States::RUNNING; + } + std::cout << "Watch request received. Restarting OBSW if not running for " + << watchdog::MAX_NOT_RUNNING_MS / 1000 << " seconds" << std::endl; + if (loopResult == LoopResult::START_REQ) { + watchingObsw = false; + } else if (loopResult == LoopResult::START_WITH_WATCH_REQ) { + watchingObsw = true; + } + performRunningOperation(); + break; + } + default: { + sleep = true; + } + } + break; + } + } + if (loopResult == LoopResult::FAULT) { + // Configuration error + std::cerr << "Fault has occured in watchdog loop" << std::endl; + // Prevent spam + sleep = true; + } + if (sleep) { + std::this_thread::sleep_for(1000ms); + } + return true; } + +int WatchdogTask::performSuspendOperation() { return 0; } diff --git a/watchdog/Watchdog.h b/watchdog/Watchdog.h index 5745c033..524675a9 100644 --- a/watchdog/Watchdog.h +++ b/watchdog/Watchdog.h @@ -2,49 +2,52 @@ #define WATCHDOG_WATCHDOG_H_ #include +#include #include +#include #include class WatchdogTask { -public: - enum class States { - NOT_STARTED, - RUNNING, - SUSPENDED, - FAULTY - }; + public: + enum class States { NOT_STARTED, RUNNING, SUSPENDED, FAULTY }; - enum class LoopResult { - OK, - SUSPEND_RQ, - CANCEL_RQ, - RESTART_RQ, - TIMEOUT, - HUNG_UP, - FAULT - }; + enum class LoopResult { + OK, + START_REQ, + START_WITH_WATCH_REQ, + SUSPEND_REQ, + CANCEL_REQ, + TIMEOUT, + HUNG_UP, + FAULT + }; - WatchdogTask(); + WatchdogTask(); - virtual ~WatchdogTask(); + virtual ~WatchdogTask(); - int performOperation(); -private: - int fd = 0; + int performOperation(); - bool obswRunning = false; - bool watchdogRunning = false; - bool printNotRunningLatch = false; - std::array buf; - States state = States::NOT_STARTED; + private: + int fd = 0; - LoopResult watchdogLoop(); - LoopResult pollEvent(struct pollfd& waiter); - LoopResult parseCommandByte(ssize_t readLen); + bool obswRunning = false; + bool watchingObsw = false; + bool printNotRunningLatch = false; + std::array buf; + std::optional> notRunningStart; + States state = States::NOT_STARTED; - int performRunningOperation(); - int performNotRunningOperation(LoopResult type); - int performSuspendOperation(); + // Primary loop. Takes care of delaying, and reading from the communication pipe and translating + // messages to loop results. + LoopResult watchdogLoop(); + bool stateMachine(LoopResult result); + LoopResult pollEvent(struct pollfd& waiter); + LoopResult parseCommand(ssize_t readLen); + + int performRunningOperation(); + int performNotRunningOperation(LoopResult type); + int performSuspendOperation(); }; #endif /* WATCHDOG_WATCHDOG_H_ */ diff --git a/watchdog/definitions.h b/watchdog/definitions.h index bfb1ec13..5b68023a 100644 --- a/watchdog/definitions.h +++ b/watchdog/definitions.h @@ -5,17 +5,31 @@ namespace watchdog { +namespace first { + +// Start or restart character +static constexpr char START_CHAR = 'b'; // Suspend watchdog operations temporarily static constexpr char SUSPEND_CHAR = 's'; -// Resume watchdog operations -static constexpr char RESTART_CHAR = 'b'; // Causes the watchdog to close down static constexpr char CANCEL_CHAR = 'c'; +static constexpr char IDLE_CHAR = 'i'; + +} // namespace first + +namespace second { + +// Supplied with the start character. This will instruct the watchdog to actually watch +// the OBSW is runnng all the time. +static constexpr char WATCH_FLAG = 'w'; +} // namespace second static constexpr int TIMEOUT_MS = 5 * 1000; +// 2 minutes +static constexpr unsigned MAX_NOT_RUNNING_MS = 2 * 60 * 1000; const std::string FIFO_NAME = "/tmp/watchdog-pipe"; const std::string RUNNING_FILE_NAME = "/tmp/obsw-running"; -} +} // namespace watchdog #endif /* WATCHDOG_DEFINITIONS_H_ */ diff --git a/watchdog/main.cpp b/watchdog/main.cpp index ba75dc30..69f75623 100644 --- a/watchdog/main.cpp +++ b/watchdog/main.cpp @@ -1,24 +1,22 @@ -#include "Watchdog.h" - #include +#include "Watchdog.h" + /** * @brief This watchdog application uses a FIFO to check whether the OBSW is still running. * It checks whether the OBSW writes to the the FIFO regularly. */ int main() { - std::cout << "eive-watchdog: Starting OBSW watchdog.." << std::endl; - try { - WatchdogTask watchdogTask; - int result = watchdogTask.performOperation(); - if(result != 0) { - return result; - } + std::cout << "eive-watchdog: Starting OBSW watchdog.." << std::endl; + try { + WatchdogTask watchdogTask; + int result = watchdogTask.performOperation(); + if (result != 0) { + return result; } - catch(const std::runtime_error& e) { - std::cerr << "eive-watchdog: Run time exception " << e.what() << std::endl; - return -1; - } - return 0; + } catch (const std::runtime_error& e) { + std::cerr << "eive-watchdog: Run time exception " << e.what() << std::endl; + return -1; + } + return 0; } -