reworked watchdog

This commit is contained in:
Robin Müller 2023-02-23 23:56:11 +01:00
parent e33a0fd60b
commit f789380343
No known key found for this signature in database
GPG Key ID: 11D4952C8CCEF814
14 changed files with 470 additions and 366 deletions

View File

@ -16,6 +16,18 @@ will constitute a breaking change warranting a new major release:
# [unreleased] # [unreleased]
## Changed
- Improved the OBSW watchdog by adding a watch functionality. The watch functionality is optional
and has to be enabled specifically by the application being watched by the watchdog when
starting the watchdog. If the watch functionality is enabled and the OBSW has not pinged
the watchdog via the FIFO for 2 minutes, the watchdog will restart the OBSW service via systemd.
The primary OBSW will only activate the watch functionality if it is the OBSW inside the
`/usr/bin` directory. This allows debugging the system: flashed or manually copied
debugging images have 2 minutes to start the watchdog without the watch functionality.
- The SD card prefix is now set earlier inside the `CoreController` constructor
- The watchdog handling was moved outside the `CoreController` into the main loop.
# [v1.31.1] # [v1.31.1]
## Fixed ## Fixed

View File

@ -1,4 +1,4 @@
target_sources(${OBSW_NAME} PRIVATE CoreController.cpp scheduling.cpp target_sources(${OBSW_NAME} PRIVATE CoreController.cpp scheduling.cpp
ObjectFactory.cpp) ObjectFactory.cpp WatchdogHandler.cpp)
target_sources(${SIMPLE_OBSW_NAME} PRIVATE scheduling.cpp) target_sources(${SIMPLE_OBSW_NAME} PRIVATE scheduling.cpp)

View File

@ -33,12 +33,7 @@ xsc::Copy CoreController::CURRENT_COPY = xsc::Copy::NO_COPY;
CoreController::CoreController(object_id_t objectId) CoreController::CoreController(object_id_t objectId)
: ExtendedControllerBase(objectId, 5), opDivider5(5), opDivider10(10), hkSet(this) { : ExtendedControllerBase(objectId, 5), opDivider5(5), opDivider10(10), hkSet(this) {
ReturnValue_t result = returnvalue::OK;
try { try {
result = initWatchdogFifo();
if (result != returnvalue::OK) {
sif::warning << "CoreController::CoreController: Watchdog FIFO init failed" << std::endl;
}
sdcMan = SdCardManager::instance(); sdcMan = SdCardManager::instance();
if (sdcMan == nullptr) { if (sdcMan == nullptr) {
sif::error << "CoreController::CoreController: SD card manager invalid!" << std::endl; sif::error << "CoreController::CoreController: SD card manager invalid!" << std::endl;
@ -47,11 +42,25 @@ CoreController::CoreController(object_id_t objectId)
if (not BLOCKING_SD_INIT) { if (not BLOCKING_SD_INIT) {
sdcMan->setBlocking(false); sdcMan->setBlocking(false);
} }
auto sdCard = sdcMan->getPreferredSdCard();
if (not sdCard.has_value()) {
sif::error << "CoreController::initializeAfterTaskCreation: "
"Issues getting preferred SD card, setting to 0"
<< std::endl;
sdCard = sd::SdCard::SLOT_0;
}
sdInfo.active = sdCard.value();
sdcMan->setActiveSdCard(sdInfo.active);
currMntPrefix = sdcMan->getCurrentMountPrefix();
getCurrentBootCopy(CURRENT_CHIP, CURRENT_COPY); getCurrentBootCopy(CURRENT_CHIP, CURRENT_COPY);
} catch (const std::filesystem::filesystem_error &e) { } catch (const std::filesystem::filesystem_error &e) {
sif::error << "CoreController::CoreController: Failed with exception " << e.what() << std::endl; sif::error << "CoreController::CoreController: Failed with exception " << e.what() << std::endl;
} }
// Add script folder to path
char *currentEnvPath = getenv("PATH");
std::string updatedEnvPath = std::string(currentEnvPath) + ":/home/root/scripts:/usr/local/bin";
setenv("PATH", updatedEnvPath.c_str(), true);
sdCardCheckCd.timeOut(); sdCardCheckCd.timeOut();
eventQueue = QueueFactory::instance()->createMessageQueue(5, EventMessage::MAX_MESSAGE_SIZE); eventQueue = QueueFactory::instance()->createMessageQueue(5, EventMessage::MAX_MESSAGE_SIZE);
} }
@ -76,7 +85,6 @@ void CoreController::performControlOperation() {
} }
} }
} }
performWatchdogControlOperation();
sdStateMachine(); sdStateMachine();
performMountedSdCardOperations(); performMountedSdCardOperations();
if (sdCardCheckCd.hasTimedOut()) { if (sdCardCheckCd.hasTimedOut()) {
@ -146,19 +154,6 @@ ReturnValue_t CoreController::initialize() {
ReturnValue_t CoreController::initializeAfterTaskCreation() { ReturnValue_t CoreController::initializeAfterTaskCreation() {
ReturnValue_t result = returnvalue::OK; ReturnValue_t result = returnvalue::OK;
auto sdCard = sdcMan->getPreferredSdCard();
if (not sdCard) {
return returnvalue::FAILED;
}
sdInfo.active = sdCard.value();
if (sdInfo.active == sd::SdCard::NONE) {
sif::error << "CoreController::initializeAfterTaskCreation: "
"Issues getting preferred SD card, setting to 0"
<< std::endl;
sdInfo.active = sd::SdCard::SLOT_0;
}
sdcMan->setActiveSdCard(sdInfo.active);
currMntPrefix = sdcMan->getCurrentMountPrefix();
if (BLOCKING_SD_INIT) { if (BLOCKING_SD_INIT) {
result = initSdCardBlocking(); result = initSdCardBlocking();
if (result != returnvalue::OK and result != SdCardManager::ALREADY_MOUNTED) { if (result != returnvalue::OK and result != SdCardManager::ALREADY_MOUNTED) {
@ -170,12 +165,7 @@ ReturnValue_t CoreController::initializeAfterTaskCreation() {
if (result != returnvalue::OK) { if (result != returnvalue::OK) {
sif::warning << "CoreController::initialize: Version initialization failed" << std::endl; sif::warning << "CoreController::initialize: Version initialization failed" << std::endl;
} }
// Add script folder to path
char *currentEnvPath = getenv("PATH");
std::string updatedEnvPath = std::string(currentEnvPath) + ":/home/root/scripts:/usr/local/bin";
setenv("PATH", updatedEnvPath.c_str(), true);
updateProtInfo(); updateProtInfo();
initPrint();
return ExtendedControllerBase::initializeAfterTaskCreation(); return ExtendedControllerBase::initializeAfterTaskCreation();
} }
@ -839,36 +829,6 @@ void CoreController::getCurrentBootCopy(xsc::Chip &chip, xsc::Copy &copy) {
copy = static_cast<xsc::Copy>(xscCopy); copy = static_cast<xsc::Copy>(xscCopy);
} }
ReturnValue_t CoreController::initWatchdogFifo() {
if (not std::filesystem::exists(watchdog::FIFO_NAME)) {
// Still return returnvalue::OK for now
sif::info << "Watchdog FIFO " << watchdog::FIFO_NAME << " does not exist, can't initiate"
<< " watchdog" << std::endl;
return returnvalue::OK;
}
// Open FIFO write only and non-blocking to prevent SW from killing itself.
watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
if (watchdogFifoFd < 0) {
if (errno == ENXIO) {
watchdogFifoFd = RETRY_FIFO_OPEN;
sif::info << "eive-watchdog not running. FIFO can not be opened" << std::endl;
} else {
sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with " << errno
<< ": " << strerror(errno) << std::endl;
return returnvalue::FAILED;
}
}
return returnvalue::OK;
}
void CoreController::initPrint() {
#if OBSW_VERBOSE_LEVEL >= 1
if (watchdogFifoFd > 0) {
sif::info << "Opened watchdog FIFO successfully.." << std::endl;
}
#endif
}
ReturnValue_t CoreController::actionXscReboot(const uint8_t *data, size_t size) { ReturnValue_t CoreController::actionXscReboot(const uint8_t *data, size_t size) {
if (size < 1) { if (size < 1) {
return HasActionsIF::INVALID_PARAMETERS; return HasActionsIF::INVALID_PARAMETERS;
@ -1223,36 +1183,6 @@ ReturnValue_t CoreController::handleProtInfoUpdateLine(std::string nextLine) {
return returnvalue::OK; return returnvalue::OK;
} }
void CoreController::performWatchdogControlOperation() {
// Only perform each fifth iteration
if (watchdogFifoFd != 0 and opDivider5.check()) {
if (watchdogFifoFd == RETRY_FIFO_OPEN) {
// Open FIFO write only and non-blocking
watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
if (watchdogFifoFd < 0) {
if (errno == ENXIO) {
watchdogFifoFd = RETRY_FIFO_OPEN;
// No printout for now, would be spam
return;
} else {
sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with "
<< errno << ": " << strerror(errno) << std::endl;
return;
}
}
sif::info << "Opened " << watchdog::FIFO_NAME << " successfully" << std::endl;
} else if (watchdogFifoFd > 0) {
// Write to OBSW watchdog FIFO here
const char writeChar = 'a';
ssize_t writtenBytes = write(watchdogFifoFd, &writeChar, 1);
if (writtenBytes < 0) {
sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno)
<< std::endl;
}
}
}
}
void CoreController::performMountedSdCardOperations() { void CoreController::performMountedSdCardOperations() {
auto mountedSdCardOp = [&](sd::SdCard sdCard, std::string mntPoint) { auto mountedSdCardOp = [&](sd::SdCard sdCard, std::string mntPoint) {
if (not performOneShotSdCardOpsSwitch) { if (not performOneShotSdCardOpsSwitch) {

View File

@ -162,9 +162,6 @@ class CoreController : public ExtendedControllerBase {
private: private:
static constexpr MutexIF::TimeoutType TIMEOUT_TYPE = MutexIF::TimeoutType::WAITING; static constexpr MutexIF::TimeoutType TIMEOUT_TYPE = MutexIF::TimeoutType::WAITING;
static constexpr uint32_t MUTEX_TIMEOUT = 20; static constexpr uint32_t MUTEX_TIMEOUT = 20;
// Designated value for rechecking FIFO open
static constexpr int RETRY_FIFO_OPEN = -2;
int watchdogFifoFd = 0;
GpsHyperion::FixMode gpsFix = GpsHyperion::FixMode::UNKNOWN; GpsHyperion::FixMode gpsFix = GpsHyperion::FixMode::UNKNOWN;
// States for SD state machine, which is used in non-blocking mode // States for SD state machine, which is used in non-blocking mode
@ -260,7 +257,6 @@ class CoreController : public ExtendedControllerBase {
ReturnValue_t performSdCardCheck(); ReturnValue_t performSdCardCheck();
ReturnValue_t timeFileHandler(); ReturnValue_t timeFileHandler();
ReturnValue_t initBootCopyFile(); ReturnValue_t initBootCopyFile();
ReturnValue_t initWatchdogFifo();
ReturnValue_t initSdCardBlocking(); ReturnValue_t initSdCardBlocking();
bool startSdStateMachine(sd::SdCard targetActiveSd, SdCfgMode mode, MessageQueueId_t commander, bool startSdStateMachine(sd::SdCard targetActiveSd, SdCfgMode mode, MessageQueueId_t commander,
DeviceCommandId_t actionId); DeviceCommandId_t actionId);
@ -285,8 +281,6 @@ class CoreController : public ExtendedControllerBase {
ReturnValue_t gracefulShutdownTasks(xsc::Chip chip, xsc::Copy copy, bool& protOpPerformed); ReturnValue_t gracefulShutdownTasks(xsc::Chip chip, xsc::Copy copy, bool& protOpPerformed);
void performWatchdogControlOperation();
ReturnValue_t handleProtInfoUpdateLine(std::string nextLine); ReturnValue_t handleProtInfoUpdateLine(std::string nextLine);
int handleBootCopyProtAtIndex(xsc::Chip targetChip, xsc::Copy targetCopy, bool protect, int handleBootCopyProtAtIndex(xsc::Chip targetChip, xsc::Copy targetCopy, bool protect,
bool& protOperationPerformed, bool selfChip, bool selfCopy, bool& protOperationPerformed, bool selfChip, bool selfCopy,

View File

@ -0,0 +1,84 @@
#include "WatchdogHandler.h"
#include <fcntl.h>
#include <unistd.h>
#include <cerrno>
#include <cstring>
#include <filesystem>
#include "fsfw/serviceinterface.h"
#include "watchdog/definitions.h"
// No construction-time work is done here; the FIFO is only opened in initialize().
WatchdogHandler::WatchdogHandler() = default;
/**
 * Cyclic watchdog servicing.
 *
 * Depending on the stored FIFO descriptor this either
 *  - does nothing (descriptor is 0, or a negative value other than RETRY_FIFO_OPEN),
 *  - writes a single idle character to the already opened FIFO, or
 *  - retries opening the FIFO and, on success, sends the start command.
 */
void WatchdogHandler::periodicOperation() {
  // 0 is the initial descriptor value: no FIFO handling is attempted.
  if (watchdogFifoFd == 0) {
    return;
  }

  if (watchdogFifoFd > 0) {
    // FIFO is open: write one idle character to the OBSW watchdog FIFO.
    const char idleChar = watchdog::first::IDLE_CHAR;
    const ssize_t numWritten = write(watchdogFifoFd, &idleChar, 1);
    if (numWritten < 0) {
      sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno)
                 << std::endl;
    }
    return;
  }

  // Negative descriptors other than the retry marker are left untouched.
  if (watchdogFifoFd != RETRY_FIFO_OPEN) {
    return;
  }

  // Retry: open FIFO write only and non-blocking.
  watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
  if (watchdogFifoFd < 0) {
    if (errno != ENXIO) {
      sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with "
                 << errno << ": " << strerror(errno) << std::endl;
      return;
    }
    // ENXIO: schedule another attempt. No printout for now, would be spam.
    watchdogFifoFd = RETRY_FIFO_OPEN;
    return;
  }

  sif::info << "Opened " << watchdog::FIFO_NAME << " successfully" << std::endl;
  performStartHandling();
}
/**
 * Open the watchdog FIFO and send the start command.
 *
 * @param enableWatchdogFunction  Stored in enableWatchFunction; when true, the start command
 *                                additionally carries the watch flag.
 * @return returnvalue::OK if the FIFO does not exist, if the FIFO could not be opened yet
 *         because of ENXIO (the open is then retried in periodicOperation()), or if the start
 *         command was written. returnvalue::FAILED if opening failed for another reason or
 *         writing the start command failed.
 */
ReturnValue_t WatchdogHandler::initialize(bool enableWatchdogFunction) {
  this->enableWatchFunction = enableWatchdogFunction;
  if (not std::filesystem::exists(watchdog::FIFO_NAME)) {
    // Still return returnvalue::OK for now
    sif::info << "Watchdog FIFO " << watchdog::FIFO_NAME << " does not exist, can't initiate"
              << " watchdog" << std::endl;
    return returnvalue::OK;
  }
  // Open FIFO write only and non-blocking to prevent SW from killing itself.
  watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
  if (watchdogFifoFd < 0) {
    if (errno == ENXIO) {
      watchdogFifoFd = RETRY_FIFO_OPEN;
      sif::info << "eive-watchdog not running. FIFO can not be opened" << std::endl;
      // Do not fall through to performStartHandling(): the descriptor is only a retry marker,
      // so writing to it would fail and turn this recoverable case into an error.
      // periodicOperation() re-attempts the open and sends the start command once it succeeds.
      return returnvalue::OK;
    }
    sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with " << errno
               << ": " << strerror(errno) << std::endl;
    return returnvalue::FAILED;
  }
  return performStartHandling();
}
/**
 * Write the start command to the watchdog FIFO.
 *
 * The command is the start character, followed by the watch flag as a second byte when
 * enableWatchFunction is set.
 *
 * @return returnvalue::OK if write() did not report an error, returnvalue::FAILED otherwise.
 */
ReturnValue_t WatchdogHandler::performStartHandling() {
  char startCmd[2];
  startCmd[0] = watchdog::first::START_CHAR;
  size_t cmdLen = 1;
  if (enableWatchFunction) {
    // Append the optional second byte requesting the watch functionality.
    startCmd[1] = watchdog::second::WATCH_FLAG;
    cmdLen = 2;
  }
  if (write(watchdogFifoFd, startCmd, cmdLen) < 0) {
    sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno)
               << std::endl;
    return returnvalue::FAILED;
  }
  return returnvalue::OK;
}

View File

@ -0,0 +1,23 @@
#ifndef BSP_Q7S_CORE_WATCHDOGHANDLER_H_
#define BSP_Q7S_CORE_WATCHDOGHANDLER_H_
#include "fsfw/returnvalues/returnvalue.h"
/**
 * @brief Writer side of the watchdog FIFO.
 *
 * initialize() opens the FIFO write-only and non-blocking and sends the start command.
 * periodicOperation() then either writes an idle character to the opened FIFO or retries
 * opening it if it could not be opened before.
 */
class WatchdogHandler {
public:
WatchdogHandler();
/**
 * Open the watchdog FIFO and send the start command.
 * @param enableWatchFunction If true, the start command additionally carries the watch flag.
 * @return Result of opening the FIFO and sending the start command.
 */
ReturnValue_t initialize(bool enableWatchFunction);
/**
 * Cyclic operation: writes an idle character to the FIFO if it is open, or retries opening
 * the FIFO if the descriptor is marked with RETRY_FIFO_OPEN.
 */
void periodicOperation();
private:
// Designated value for rechecking FIFO open
static constexpr int RETRY_FIFO_OPEN = -2;
// FIFO file descriptor. 0: FIFO not opened, periodicOperation() does nothing.
// RETRY_FIFO_OPEN: opening is retried in periodicOperation(). > 0: FIFO is open.
int watchdogFifoFd = 0;
// Whether the watch flag is appended to the start command.
bool enableWatchFunction = false;
// Writes the start command (and optionally the watch flag) to the FIFO.
ReturnValue_t performStartHandling();
};
#endif /* BSP_Q7S_CORE_WATCHDOGHANDLER_H_ */

View File

@ -9,6 +9,7 @@
#include <iostream> #include <iostream>
#include "OBSWConfig.h" #include "OBSWConfig.h"
#include "bsp_q7s/core/WatchdogHandler.h"
#include "commonConfig.h" #include "commonConfig.h"
#include "core/scheduling.h" #include "core/scheduling.h"
#include "fsfw/tasks/TaskFactory.h" #include "fsfw/tasks/TaskFactory.h"
@ -24,6 +25,9 @@ static const char* DEV_STRING = "Xiphos Q7S FM";
#else #else
static const char* DEV_STRING = "Xiphos Q7S EM"; static const char* DEV_STRING = "Xiphos Q7S EM";
#endif #endif
WatchdogHandler WATCHDOG_HANDLER;
int obsw::obsw() { int obsw::obsw() {
using namespace fsfw; using namespace fsfw;
std::cout << "-- EIVE OBSW --" << std::endl; std::cout << "-- EIVE OBSW --" << std::endl;
@ -44,6 +48,35 @@ int obsw::obsw() {
} }
#endif #endif
// Delay the boot if applicable.
bootDelayHandling();
bool initWatchFunction = false;
if (std::filesystem::current_path() == "/usr/bin") {
initWatchFunction = true;
}
ReturnValue_t result = WATCHDOG_HANDLER.initialize(initWatchFunction);
if (result != returnvalue::OK) {
std::cerr << "Initiating EIVE watchdog handler failed" << std::endl;
}
scheduling::initMission();
// Command the EIVE system to safe mode
#if OBSW_COMMAND_SAFE_MODE_AT_STARTUP == 1
commandEiveSystemToSafe();
#else
announceAllModes();
#endif
for (;;) {
WATCHDOG_HANDLER.periodicOperation();
TaskFactory::delayTask(1000);
}
return 0;
}
void obsw::bootDelayHandling() {
const char* homedir = nullptr; const char* homedir = nullptr;
homedir = getenv("HOME"); homedir = getenv("HOME");
if (homedir == nullptr) { if (homedir == nullptr) {
@ -71,31 +104,26 @@ int obsw::obsw() {
std::cout << "Delaying OBSW start for " << bootDelaySecs << " seconds" << std::endl; std::cout << "Delaying OBSW start for " << bootDelaySecs << " seconds" << std::endl;
TaskFactory::delayTask(bootDelaySecs * 1000); TaskFactory::delayTask(bootDelaySecs * 1000);
} }
}
scheduling::initMission(); void obsw::commandEiveSystemToSafe() {
// Command the EIVE system to safe mode
auto sysQueueId = satsystem::EIVE_SYSTEM.getCommandQueue(); auto sysQueueId = satsystem::EIVE_SYSTEM.getCommandQueue();
CommandMessage msg; CommandMessage msg;
#if OBSW_COMMAND_SAFE_MODE_AT_STARTUP == 1
ModeMessage::setCmdModeMessage(msg, acs::AcsMode::SAFE, 0); ModeMessage::setCmdModeMessage(msg, acs::AcsMode::SAFE, 0);
ReturnValue_t result = ReturnValue_t result =
MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false); MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false);
if (result != returnvalue::OK) { if (result != returnvalue::OK) {
sif::error << "Sending safe mode command to EIVE system failed" << std::endl; sif::error << "Sending safe mode command to EIVE system failed" << std::endl;
} }
#else }
void obsw::announceAllModes() {
auto sysQueueId = satsystem::EIVE_SYSTEM.getCommandQueue();
CommandMessage msg;
ModeMessage::setModeAnnounceMessage(msg, true); ModeMessage::setModeAnnounceMessage(msg, true);
ReturnValue_t result = ReturnValue_t result =
MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false); MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false);
if (result != returnvalue::OK) { if (result != returnvalue::OK) {
sif::error << "Sending safe mode command to EIVE system failed" << std::endl; sif::error << "Sending safe mode command to EIVE system failed" << std::endl;
} }
#endif
for (;;) {
/* Suspend main thread by sleeping it. */
TaskFactory::delayTask(5000);
}
return 0;
} }

View File

@ -5,6 +5,10 @@ namespace obsw {
int obsw(); int obsw();
}; void bootDelayHandling();
void commandEiveSystemToSafe();
void announceAllModes();
}; // namespace obsw
#endif /* BSP_Q7S_CORE_OBSW_H_ */ #endif /* BSP_Q7S_CORE_OBSW_H_ */

View File

@ -4,6 +4,7 @@ if [[ ! -f README.md ]]; then
fi fi
folder_list=( folder_list=(
"./watchdog"
"./mission" "./mission"
"./linux" "./linux"
"./bsp_q7s" "./bsp_q7s"

View File

@ -1,10 +1,5 @@
target_sources(${WATCHDOG_NAME} PRIVATE target_sources(${WATCHDOG_NAME} PRIVATE main.cpp Watchdog.cpp)
main.cpp
Watchdog.cpp
)
target_include_directories(${WATCHDOG_NAME} PRIVATE target_include_directories(${WATCHDOG_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
${CMAKE_CURRENT_SOURCE_DIR}
)
install(TARGETS ${WATCHDOG_NAME} RUNTIME DESTINATION bin) install(TARGETS ${WATCHDOG_NAME} RUNTIME DESTINATION bin)

View File

@ -1,96 +1,61 @@
#include "Watchdog.h" #include "Watchdog.h"
#include "definitions.h"
#include <errno.h> #include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <poll.h>
#include <unistd.h>
#include <fcntl.h> #include <fcntl.h>
#include <poll.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <iostream> #include <cstdlib>
#include <fstream>
#include <thread>
#include <cstring> #include <cstring>
#include <filesystem> #include <filesystem>
#include <fstream>
#include <iostream>
#include <thread>
#include "definitions.h"
WatchdogTask::WatchdogTask (): fd(0) { WatchdogTask::WatchdogTask() : fd(0) {
int result = 0; int result = 0;
// Only create the FIFO if it does not exist yet // Only create the FIFO if it does not exist yet
if(not std::filesystem::exists(watchdog::FIFO_NAME)) { if (not std::filesystem::exists(watchdog::FIFO_NAME)) {
// Permission 666 or rw-rw-rw- // Permission 666 or rw-rw-rw-
mode_t mode = DEFFILEMODE; mode_t mode = DEFFILEMODE;
result = mkfifo(watchdog::FIFO_NAME.c_str(), mode); result = mkfifo(watchdog::FIFO_NAME.c_str(), mode);
if(result != 0) { if (result != 0) {
std::cerr << "eive-watchdog: Could not created named pipe at " << std::cerr << "eive-watchdog: Could not created named pipe at " << watchdog::FIFO_NAME
watchdog::FIFO_NAME << ", error " << errno << ": " << strerror(errno) << << ", error " << errno << ": " << strerror(errno) << std::endl;
std::endl;
throw std::runtime_error("eive-watchdog: FIFO creation failed"); throw std::runtime_error("eive-watchdog: FIFO creation failed");
} }
#if WATCHDOG_VERBOSE_LEVEL >= 1 #if WATCHDOG_VERBOSE_LEVEL >= 1
std::cout << "eive-watchdog: Pipe at " << watchdog::FIFO_NAME << std::cout << "eive-watchdog: Pipe at " << watchdog::FIFO_NAME << " created successfully"
" created successfully" << std::endl; << std::endl;
#endif #endif
} }
} }
WatchdogTask::~WatchdogTask() { WatchdogTask::~WatchdogTask() {}
}
int WatchdogTask::performOperation() { int WatchdogTask::performOperation() {
// Open FIFO read only and non-blocking // Open FIFO read only and non-blocking
fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK); fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK);
if(fd < 0) { if (fd < 0) {
std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME << std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME << "read-only failed with "
"read-only failed with " << errno << ": " << strerror(errno) << std::endl; << errno << ": " << strerror(errno) << std::endl;
return -1; return -1;
} }
state = States::RUNNING; state = States::NOT_STARTED;
while(true) { while (true) {
WatchdogTask::LoopResult loopResult = watchdogLoop(); WatchdogTask::LoopResult loopResult = watchdogLoop();
switch(loopResult) { if (not stateMachine(loopResult)) {
case(LoopResult::OK): {
performRunningOperation();
break; break;
} }
case(LoopResult::CANCEL_RQ): {
std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl;
return 0;
}
case(LoopResult::SUSPEND_RQ): {
performSuspendOperation();
break;
}
case(LoopResult::TIMEOUT): {
performNotRunningOperation(loopResult);
break;
}
case(LoopResult::HUNG_UP): {
performNotRunningOperation(loopResult);
break;
}
case(LoopResult::RESTART_RQ): {
if(state == States::SUSPENDED or state == States::FAULTY) {
performRunningOperation();
}
break;
}
case(LoopResult::FAULT): {
using namespace std::chrono_literals;
// Configuration error
std::cerr << "Fault has occured in watchdog loop" << std::endl;
// Prevent spam
std::this_thread::sleep_for(2000ms);
}
}
} }
if (close(fd) < 0) { if (close(fd) < 0) {
std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME << std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME << "failed, error "
"failed, error " << errno << ": " << strerror(errno) << std::endl; << errno << ": " << strerror(errno) << std::endl;
} }
std::cout << "eive-watchdog: Finished" << std::endl; std::cout << "eive-watchdog: Finished" << std::endl;
return 0; return 0;
@ -102,38 +67,17 @@ WatchdogTask::LoopResult WatchdogTask::watchdogLoop() {
waiter.fd = fd; waiter.fd = fd;
waiter.events = POLLIN; waiter.events = POLLIN;
switch(state) { // Only poll one file descriptor with timeout
case(States::SUSPENDED): { switch (poll(&waiter, 1, watchdog::TIMEOUT_MS)) {
// Sleep, then check whether a restart request was received case (0): {
std::this_thread::sleep_for(1000ms);
break;
}
case(States::RUNNING): {
// Continue as usual
break;
}
case(States::NOT_STARTED): {
// This should not happen
std::cerr << "eive-watchdog: State is NOT_STARTED, configuration error" << std::endl;
break;
}
case(States::FAULTY): {
// TODO: Not sure what to do yet. Continue for now
break;
}
}
// 10 seconds timeout, only poll one file descriptor
switch(poll(&waiter, 1, watchdog::TIMEOUT_MS)) {
case(0): {
return LoopResult::TIMEOUT; return LoopResult::TIMEOUT;
} }
case(1): { case (1): {
return pollEvent(waiter); return pollEvent(waiter);
} }
default: { default: {
std::cerr << "eive-watchdog: Unknown poll error at " << watchdog::FIFO_NAME << ", error " << std::cerr << "eive-watchdog: Unknown poll error at " << watchdog::FIFO_NAME << ", error "
errno << ": " << strerror(errno) << std::endl; << errno << ": " << strerror(errno) << std::endl;
break; break;
} }
} }
@ -144,58 +88,55 @@ WatchdogTask::LoopResult WatchdogTask::pollEvent(struct pollfd& waiter) {
if (waiter.revents & POLLIN) { if (waiter.revents & POLLIN) {
ssize_t readLen = read(fd, buf.data(), buf.size()); ssize_t readLen = read(fd, buf.data(), buf.size());
if (readLen < 0) { if (readLen < 0) {
std::cerr << "eive-watchdog: Read error on pipe " << watchdog::FIFO_NAME << std::cerr << "eive-watchdog: Read error on pipe " << watchdog::FIFO_NAME << ", error "
", error " << errno << ": " << strerror(errno) << std::endl; << errno << ": " << strerror(errno) << std::endl;
return LoopResult::OK; return LoopResult::OK;
} }
#if WATCHDOG_VERBOSE_LEVEL == 2 #if WATCHDOG_VERBOSE_LEVEL == 2
std::cout << "Read " << readLen << " byte(s) on the pipe " << FIFO_NAME std::cout << "Read " << readLen << " byte(s) on the pipe " << FIFO_NAME << std::endl;
<< std::endl;
#endif #endif
else if(readLen >= 1) { else if (readLen >= 1) {
return parseCommandByte(readLen); return parseCommand(readLen);
} }
} } else if (waiter.revents & POLLERR) {
else if(waiter.revents & POLLERR) { std::cerr << "eive-watchdog: Poll error error on pipe " << watchdog::FIFO_NAME << std::endl;
std::cerr << "eive-watchdog: Poll error error on pipe " << watchdog::FIFO_NAME <<
std::endl;
return LoopResult::FAULT; return LoopResult::FAULT;
} } else if (waiter.revents & POLLHUP) {
else if (waiter.revents & POLLHUP) {
// Writer closed its end // Writer closed its end
return LoopResult::HUNG_UP; return LoopResult::HUNG_UP;
} }
return LoopResult::FAULT; return LoopResult::FAULT;
} }
WatchdogTask::LoopResult WatchdogTask::parseCommandByte(ssize_t readLen) { WatchdogTask::LoopResult WatchdogTask::parseCommand(ssize_t readLen) {
for(ssize_t idx = 0; idx < readLen; idx++) { char readChar = buf[0];
char readChar = buf[idx];
// Cancel request // Cancel request
if(readChar == watchdog::CANCEL_CHAR) { if (readChar == watchdog::first::CANCEL_CHAR) {
return LoopResult::CANCEL_RQ; return LoopResult::CANCEL_REQ;
} } else if (readChar == watchdog::first::SUSPEND_CHAR) {
// Begin request. Does not work if the operation was not suspended before
else if(readChar == watchdog::RESTART_CHAR) {
return LoopResult::RESTART_RQ;
}
// Suspend request // Suspend request
else if(readChar == watchdog::SUSPEND_CHAR) { return LoopResult::SUSPEND_REQ;
return LoopResult::SUSPEND_RQ; } else if (readChar == watchdog::first::START_CHAR) {
if (readLen == 2 and static_cast<char>(buf[1]) == watchdog::second::WATCH_FLAG) {
return LoopResult::START_WITH_WATCH_REQ;
}
return LoopResult::START_REQ;
} }
// Everything else: All working as expected // Everything else: All working as expected
}
return LoopResult::OK; return LoopResult::OK;
} }
int WatchdogTask::performRunningOperation() { int WatchdogTask::performRunningOperation() {
if(state != States::RUNNING) { if (state != States::RUNNING) {
state = States::RUNNING; state = States::RUNNING;
} }
if (notRunningStart.has_value()) {
notRunningStart = std::nullopt;
}
if(not obswRunning) { if (not obswRunning) {
if(printNotRunningLatch) { if (printNotRunningLatch) {
// Reset latch so user can see timeouts // Reset latch so user can see timeouts
printNotRunningLatch = false; printNotRunningLatch = false;
} }
@ -206,9 +147,8 @@ int WatchdogTask::performRunningOperation() {
std::cout << "eive-watchdog: Creating " << watchdog::RUNNING_FILE_NAME << std::endl; std::cout << "eive-watchdog: Creating " << watchdog::RUNNING_FILE_NAME << std::endl;
if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) { if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) {
std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME); std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME);
if(not obswRunningFile.good()) { if (not obswRunningFile.good()) {
std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed" std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed" << std::endl;
<< std::endl;
} }
} }
#endif #endif
@ -218,29 +158,41 @@ int WatchdogTask::performRunningOperation() {
int WatchdogTask::performNotRunningOperation(LoopResult type) { int WatchdogTask::performNotRunningOperation(LoopResult type) {
// Latch prevents spam on console // Latch prevents spam on console
if(not printNotRunningLatch) { if (not printNotRunningLatch) {
if(type == LoopResult::HUNG_UP) { if (type == LoopResult::HUNG_UP) {
std::cout << "eive-watchdog: FIFO writer hung up!" << std::endl; std::cout << "eive-watchdog: FIFO writer hung up!" << std::endl;
} } else {
else {
std::cout << "eive-watchdog: The FIFO timed out!" << std::endl; std::cout << "eive-watchdog: The FIFO timed out!" << std::endl;
} }
printNotRunningLatch = true; printNotRunningLatch = true;
} }
if(obswRunning) { if (not notRunningStart.has_value()) {
notRunningStart = std::chrono::system_clock::now();
}
if (obswRunning) {
#if WATCHDOG_CREATE_FILE_IF_RUNNING == 1 #if WATCHDOG_CREATE_FILE_IF_RUNNING == 1
if (std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) { if (std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) {
int result = std::remove(watchdog::RUNNING_FILE_NAME.c_str()); int result = std::remove(watchdog::RUNNING_FILE_NAME.c_str());
if(result != 0) { if (result != 0) {
std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code " << std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code " << errno
errno << ": " << strerror(errno) << std::endl; << ": " << strerror(errno) << std::endl;
} }
} }
#endif #endif
obswRunning = false; obswRunning = false;
} }
if(type == LoopResult::HUNG_UP) {
if (watchingObsw) {
auto timeNotRunning = std::chrono::system_clock::now() - notRunningStart.value();
if (std::chrono::duration_cast<std::chrono::milliseconds>(timeNotRunning).count() >
watchdog::MAX_NOT_RUNNING_MS) {
std::cout << "Restarting OBSW" << std::endl;
std::system("systemctl restart obsw");
}
}
if (type == LoopResult::HUNG_UP) {
using namespace std::chrono_literals; using namespace std::chrono_literals;
// Prevent spam // Prevent spam
std::this_thread::sleep_for(2000ms); std::this_thread::sleep_for(2000ms);
@ -248,11 +200,77 @@ int WatchdogTask::performNotRunningOperation(LoopResult type) {
return 0; return 0;
} }
int WatchdogTask::performSuspendOperation() { bool WatchdogTask::stateMachine(LoopResult loopResult) {
if(state == States::RUNNING or state == States::FAULTY) { using namespace std::chrono_literals;
bool sleep = false;
switch (state) {
case (States::RUNNING): {
switch (loopResult) {
case (LoopResult::TIMEOUT):
case (LoopResult::HUNG_UP): {
performNotRunningOperation(loopResult);
break;
}
case (LoopResult::OK): {
performRunningOperation();
break;
}
case (LoopResult::SUSPEND_REQ): {
if (state == States::RUNNING or state == States::FAULTY) {
std::cout << "eive-watchdog: Suspending watchdog operations" << std::endl; std::cout << "eive-watchdog: Suspending watchdog operations" << std::endl;
watchdogRunning = false;
state = States::SUSPENDED; state = States::SUSPENDED;
} }
return 0; performSuspendOperation();
sleep = true;
break;
}
case (LoopResult::CANCEL_REQ): {
std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl;
return false;
}
}
}
case (States::FAULTY):
case (States::SUSPENDED):
case (States::NOT_STARTED): {
switch (loopResult) {
case (LoopResult::SUSPEND_REQ): {
// Ignore and also delay
sleep = true;
break;
}
case (LoopResult::START_REQ):
case (LoopResult::START_WITH_WATCH_REQ): {
if (state == States::NOT_STARTED or state == States::FAULTY) {
state = States::RUNNING;
}
std::cout << "Watch request received. Restarting OBSW if not running for "
<< watchdog::MAX_NOT_RUNNING_MS / 1000 << " seconds" << std::endl;
if (loopResult == LoopResult::START_REQ) {
watchingObsw = false;
} else if (loopResult == LoopResult::START_WITH_WATCH_REQ) {
watchingObsw = true;
}
performRunningOperation();
break;
}
default: {
sleep = true;
}
}
break;
}
}
if (loopResult == LoopResult::FAULT) {
// Configuration error
std::cerr << "Fault has occured in watchdog loop" << std::endl;
// Prevent spam
sleep = true;
}
if (sleep) {
std::this_thread::sleep_for(1000ms);
}
return true;
} }
int WatchdogTask::performSuspendOperation() { return 0; }

View File

@ -2,23 +2,21 @@
#define WATCHDOG_WATCHDOG_H_ #define WATCHDOG_WATCHDOG_H_
#include <array> #include <array>
#include <chrono>
#include <cstdint> #include <cstdint>
#include <optional>
#include <string> #include <string>
class WatchdogTask { class WatchdogTask {
public: public:
enum class States { enum class States { NOT_STARTED, RUNNING, SUSPENDED, FAULTY };
NOT_STARTED,
RUNNING,
SUSPENDED,
FAULTY
};
enum class LoopResult { enum class LoopResult {
OK, OK,
SUSPEND_RQ, START_REQ,
CANCEL_RQ, START_WITH_WATCH_REQ,
RESTART_RQ, SUSPEND_REQ,
CANCEL_REQ,
TIMEOUT, TIMEOUT,
HUNG_UP, HUNG_UP,
FAULT FAULT
@ -29,18 +27,23 @@ public:
virtual ~WatchdogTask(); virtual ~WatchdogTask();
int performOperation(); int performOperation();
private:
private:
int fd = 0; int fd = 0;
bool obswRunning = false; bool obswRunning = false;
bool watchdogRunning = false; bool watchingObsw = false;
bool printNotRunningLatch = false; bool printNotRunningLatch = false;
std::array<uint8_t, 64> buf; std::array<uint8_t, 64> buf;
std::optional<std::chrono::time_point<std::chrono::system_clock>> notRunningStart;
States state = States::NOT_STARTED; States state = States::NOT_STARTED;
// Primary loop. Takes care of delaying, and reading from the communication pipe and translating
// messages to loop results.
LoopResult watchdogLoop(); LoopResult watchdogLoop();
bool stateMachine(LoopResult result);
LoopResult pollEvent(struct pollfd& waiter); LoopResult pollEvent(struct pollfd& waiter);
LoopResult parseCommandByte(ssize_t readLen); LoopResult parseCommand(ssize_t readLen);
int performRunningOperation(); int performRunningOperation();
int performNotRunningOperation(LoopResult type); int performNotRunningOperation(LoopResult type);

View File

@ -5,17 +5,31 @@
namespace watchdog { namespace watchdog {
namespace first {
// Start or restart character
static constexpr char START_CHAR = 'b';
// Suspend watchdog operations temporarily // Suspend watchdog operations temporarily
static constexpr char SUSPEND_CHAR = 's'; static constexpr char SUSPEND_CHAR = 's';
// Resume watchdog operations
static constexpr char RESTART_CHAR = 'b';
// Causes the watchdog to close down // Causes the watchdog to close down
static constexpr char CANCEL_CHAR = 'c'; static constexpr char CANCEL_CHAR = 'c';
static constexpr char IDLE_CHAR = 'i';
} // namespace first
namespace second {
// Supplied with the start character. This instructs the watchdog to actually check
// that the OBSW is running all the time.
static constexpr char WATCH_FLAG = 'w';
} // namespace second
static constexpr int TIMEOUT_MS = 5 * 1000; static constexpr int TIMEOUT_MS = 5 * 1000;
// 2 minutes
static constexpr unsigned MAX_NOT_RUNNING_MS = 2 * 60 * 1000;
const std::string FIFO_NAME = "/tmp/watchdog-pipe"; const std::string FIFO_NAME = "/tmp/watchdog-pipe";
const std::string RUNNING_FILE_NAME = "/tmp/obsw-running"; const std::string RUNNING_FILE_NAME = "/tmp/obsw-running";
} } // namespace watchdog
#endif /* WATCHDOG_DEFINITIONS_H_ */ #endif /* WATCHDOG_DEFINITIONS_H_ */

View File

@ -1,7 +1,7 @@
#include "Watchdog.h"
#include <iostream> #include <iostream>
#include "Watchdog.h"
/** /**
* @brief This watchdog application uses a FIFO to check whether the OBSW is still running. * @brief This watchdog application uses a FIFO to check whether the OBSW is still running.
* It checks whether the OBSW writes to the the FIFO regularly. * It checks whether the OBSW writes to the the FIFO regularly.
@ -11,14 +11,12 @@ int main() {
try { try {
WatchdogTask watchdogTask; WatchdogTask watchdogTask;
int result = watchdogTask.performOperation(); int result = watchdogTask.performOperation();
if(result != 0) { if (result != 0) {
return result; return result;
} }
} } catch (const std::runtime_error& e) {
catch(const std::runtime_error& e) {
std::cerr << "eive-watchdog: Run time exception " << e.what() << std::endl; std::cerr << "eive-watchdog: Run time exception " << e.what() << std::endl;
return -1; return -1;
} }
return 0; return 0;
} }