reworked watchdog
This commit is contained in:
parent
e33a0fd60b
commit
f789380343
12
CHANGELOG.md
12
CHANGELOG.md
@ -16,6 +16,18 @@ will consitute of a breaking change warranting a new major release:
|
||||
|
||||
# [unreleased]
|
||||
|
||||
## Changed
|
||||
|
||||
- Improved the OBSW watchdog by adding a watch functionality. The watch functionality is optional
|
||||
and has to be enabled specifically by the application being watched by the watchdog when
|
||||
starting the watchdog. If the watch functionality is enabled and the OBSW has not pinged
|
||||
the watchdog via the FIFO for 2 minutes, the watchdog will restart the OBSW service via systemd.
|
||||
The primary OBSW will only activate the watch functionality if it is the OBSW inside the
|
||||
`/usr/bin` directory. This allows debugging the system by leaving flashed or manually copied
|
||||
debugging images 2 minutes to start the watchdog without the watch functionality.
|
||||
- The SD card prefix is now set earlier inside the `CoreController` constructor
|
||||
- The watchdog handling was moved outside the `CoreController` into the main loop.
|
||||
|
||||
# [v1.31.1]
|
||||
|
||||
## Fixed
|
||||
|
@ -1,4 +1,4 @@
|
||||
target_sources(${OBSW_NAME} PRIVATE CoreController.cpp scheduling.cpp
|
||||
ObjectFactory.cpp)
|
||||
ObjectFactory.cpp WatchdogHandler.cpp)
|
||||
|
||||
target_sources(${SIMPLE_OBSW_NAME} PRIVATE scheduling.cpp)
|
||||
|
@ -33,12 +33,7 @@ xsc::Copy CoreController::CURRENT_COPY = xsc::Copy::NO_COPY;
|
||||
|
||||
CoreController::CoreController(object_id_t objectId)
|
||||
: ExtendedControllerBase(objectId, 5), opDivider5(5), opDivider10(10), hkSet(this) {
|
||||
ReturnValue_t result = returnvalue::OK;
|
||||
try {
|
||||
result = initWatchdogFifo();
|
||||
if (result != returnvalue::OK) {
|
||||
sif::warning << "CoreController::CoreController: Watchdog FIFO init failed" << std::endl;
|
||||
}
|
||||
sdcMan = SdCardManager::instance();
|
||||
if (sdcMan == nullptr) {
|
||||
sif::error << "CoreController::CoreController: SD card manager invalid!" << std::endl;
|
||||
@ -47,11 +42,25 @@ CoreController::CoreController(object_id_t objectId)
|
||||
if (not BLOCKING_SD_INIT) {
|
||||
sdcMan->setBlocking(false);
|
||||
}
|
||||
auto sdCard = sdcMan->getPreferredSdCard();
|
||||
if (not sdCard.has_value()) {
|
||||
sif::error << "CoreController::initializeAfterTaskCreation: "
|
||||
"Issues getting preferred SD card, setting to 0"
|
||||
<< std::endl;
|
||||
sdCard = sd::SdCard::SLOT_0;
|
||||
}
|
||||
sdInfo.active = sdCard.value();
|
||||
sdcMan->setActiveSdCard(sdInfo.active);
|
||||
currMntPrefix = sdcMan->getCurrentMountPrefix();
|
||||
|
||||
getCurrentBootCopy(CURRENT_CHIP, CURRENT_COPY);
|
||||
} catch (const std::filesystem::filesystem_error &e) {
|
||||
sif::error << "CoreController::CoreController: Failed with exception " << e.what() << std::endl;
|
||||
}
|
||||
// Add script folder to path
|
||||
char *currentEnvPath = getenv("PATH");
|
||||
std::string updatedEnvPath = std::string(currentEnvPath) + ":/home/root/scripts:/usr/local/bin";
|
||||
setenv("PATH", updatedEnvPath.c_str(), true);
|
||||
sdCardCheckCd.timeOut();
|
||||
eventQueue = QueueFactory::instance()->createMessageQueue(5, EventMessage::MAX_MESSAGE_SIZE);
|
||||
}
|
||||
@ -76,7 +85,6 @@ void CoreController::performControlOperation() {
|
||||
}
|
||||
}
|
||||
}
|
||||
performWatchdogControlOperation();
|
||||
sdStateMachine();
|
||||
performMountedSdCardOperations();
|
||||
if (sdCardCheckCd.hasTimedOut()) {
|
||||
@ -146,19 +154,6 @@ ReturnValue_t CoreController::initialize() {
|
||||
|
||||
ReturnValue_t CoreController::initializeAfterTaskCreation() {
|
||||
ReturnValue_t result = returnvalue::OK;
|
||||
auto sdCard = sdcMan->getPreferredSdCard();
|
||||
if (not sdCard) {
|
||||
return returnvalue::FAILED;
|
||||
}
|
||||
sdInfo.active = sdCard.value();
|
||||
if (sdInfo.active == sd::SdCard::NONE) {
|
||||
sif::error << "CoreController::initializeAfterTaskCreation: "
|
||||
"Issues getting preferred SD card, setting to 0"
|
||||
<< std::endl;
|
||||
sdInfo.active = sd::SdCard::SLOT_0;
|
||||
}
|
||||
sdcMan->setActiveSdCard(sdInfo.active);
|
||||
currMntPrefix = sdcMan->getCurrentMountPrefix();
|
||||
if (BLOCKING_SD_INIT) {
|
||||
result = initSdCardBlocking();
|
||||
if (result != returnvalue::OK and result != SdCardManager::ALREADY_MOUNTED) {
|
||||
@ -170,12 +165,7 @@ ReturnValue_t CoreController::initializeAfterTaskCreation() {
|
||||
if (result != returnvalue::OK) {
|
||||
sif::warning << "CoreController::initialize: Version initialization failed" << std::endl;
|
||||
}
|
||||
// Add script folder to path
|
||||
char *currentEnvPath = getenv("PATH");
|
||||
std::string updatedEnvPath = std::string(currentEnvPath) + ":/home/root/scripts:/usr/local/bin";
|
||||
setenv("PATH", updatedEnvPath.c_str(), true);
|
||||
updateProtInfo();
|
||||
initPrint();
|
||||
return ExtendedControllerBase::initializeAfterTaskCreation();
|
||||
}
|
||||
|
||||
@ -839,36 +829,6 @@ void CoreController::getCurrentBootCopy(xsc::Chip &chip, xsc::Copy ©) {
|
||||
copy = static_cast<xsc::Copy>(xscCopy);
|
||||
}
|
||||
|
||||
ReturnValue_t CoreController::initWatchdogFifo() {
|
||||
if (not std::filesystem::exists(watchdog::FIFO_NAME)) {
|
||||
// Still return returnvalue::OK for now
|
||||
sif::info << "Watchdog FIFO " << watchdog::FIFO_NAME << " does not exist, can't initiate"
|
||||
<< " watchdog" << std::endl;
|
||||
return returnvalue::OK;
|
||||
}
|
||||
// Open FIFO write only and non-blocking to prevent SW from killing itself.
|
||||
watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
|
||||
if (watchdogFifoFd < 0) {
|
||||
if (errno == ENXIO) {
|
||||
watchdogFifoFd = RETRY_FIFO_OPEN;
|
||||
sif::info << "eive-watchdog not running. FIFO can not be opened" << std::endl;
|
||||
} else {
|
||||
sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with " << errno
|
||||
<< ": " << strerror(errno) << std::endl;
|
||||
return returnvalue::FAILED;
|
||||
}
|
||||
}
|
||||
return returnvalue::OK;
|
||||
}
|
||||
|
||||
void CoreController::initPrint() {
|
||||
#if OBSW_VERBOSE_LEVEL >= 1
|
||||
if (watchdogFifoFd > 0) {
|
||||
sif::info << "Opened watchdog FIFO successfully.." << std::endl;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ReturnValue_t CoreController::actionXscReboot(const uint8_t *data, size_t size) {
|
||||
if (size < 1) {
|
||||
return HasActionsIF::INVALID_PARAMETERS;
|
||||
@ -1223,36 +1183,6 @@ ReturnValue_t CoreController::handleProtInfoUpdateLine(std::string nextLine) {
|
||||
return returnvalue::OK;
|
||||
}
|
||||
|
||||
void CoreController::performWatchdogControlOperation() {
|
||||
// Only perform each fifth iteration
|
||||
if (watchdogFifoFd != 0 and opDivider5.check()) {
|
||||
if (watchdogFifoFd == RETRY_FIFO_OPEN) {
|
||||
// Open FIFO write only and non-blocking
|
||||
watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
|
||||
if (watchdogFifoFd < 0) {
|
||||
if (errno == ENXIO) {
|
||||
watchdogFifoFd = RETRY_FIFO_OPEN;
|
||||
// No printout for now, would be spam
|
||||
return;
|
||||
} else {
|
||||
sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with "
|
||||
<< errno << ": " << strerror(errno) << std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
sif::info << "Opened " << watchdog::FIFO_NAME << " successfully" << std::endl;
|
||||
} else if (watchdogFifoFd > 0) {
|
||||
// Write to OBSW watchdog FIFO here
|
||||
const char writeChar = 'a';
|
||||
ssize_t writtenBytes = write(watchdogFifoFd, &writeChar, 1);
|
||||
if (writtenBytes < 0) {
|
||||
sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno)
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CoreController::performMountedSdCardOperations() {
|
||||
auto mountedSdCardOp = [&](sd::SdCard sdCard, std::string mntPoint) {
|
||||
if (not performOneShotSdCardOpsSwitch) {
|
||||
|
@ -162,9 +162,6 @@ class CoreController : public ExtendedControllerBase {
|
||||
private:
|
||||
static constexpr MutexIF::TimeoutType TIMEOUT_TYPE = MutexIF::TimeoutType::WAITING;
|
||||
static constexpr uint32_t MUTEX_TIMEOUT = 20;
|
||||
// Designated value for rechecking FIFO open
|
||||
static constexpr int RETRY_FIFO_OPEN = -2;
|
||||
int watchdogFifoFd = 0;
|
||||
GpsHyperion::FixMode gpsFix = GpsHyperion::FixMode::UNKNOWN;
|
||||
|
||||
// States for SD state machine, which is used in non-blocking mode
|
||||
@ -260,7 +257,6 @@ class CoreController : public ExtendedControllerBase {
|
||||
ReturnValue_t performSdCardCheck();
|
||||
ReturnValue_t timeFileHandler();
|
||||
ReturnValue_t initBootCopyFile();
|
||||
ReturnValue_t initWatchdogFifo();
|
||||
ReturnValue_t initSdCardBlocking();
|
||||
bool startSdStateMachine(sd::SdCard targetActiveSd, SdCfgMode mode, MessageQueueId_t commander,
|
||||
DeviceCommandId_t actionId);
|
||||
@ -285,8 +281,6 @@ class CoreController : public ExtendedControllerBase {
|
||||
|
||||
ReturnValue_t gracefulShutdownTasks(xsc::Chip chip, xsc::Copy copy, bool& protOpPerformed);
|
||||
|
||||
void performWatchdogControlOperation();
|
||||
|
||||
ReturnValue_t handleProtInfoUpdateLine(std::string nextLine);
|
||||
int handleBootCopyProtAtIndex(xsc::Chip targetChip, xsc::Copy targetCopy, bool protect,
|
||||
bool& protOperationPerformed, bool selfChip, bool selfCopy,
|
||||
|
84
bsp_q7s/core/WatchdogHandler.cpp
Normal file
84
bsp_q7s/core/WatchdogHandler.cpp
Normal file
@ -0,0 +1,84 @@
|
||||
#include "WatchdogHandler.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
|
||||
#include "fsfw/serviceinterface.h"
|
||||
#include "watchdog/definitions.h"
|
||||
|
||||
// Nothing to set up at construction time: all members carry in-class default initialisers and
// the FIFO is only opened in initialize().
WatchdogHandler::WatchdogHandler() = default;
|
||||
|
||||
void WatchdogHandler::periodicOperation() {
|
||||
if (watchdogFifoFd != 0) {
|
||||
if (watchdogFifoFd == RETRY_FIFO_OPEN) {
|
||||
// Open FIFO write only and non-blocking
|
||||
watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
|
||||
if (watchdogFifoFd < 0) {
|
||||
if (errno == ENXIO) {
|
||||
watchdogFifoFd = RETRY_FIFO_OPEN;
|
||||
// No printout for now, would be spam
|
||||
return;
|
||||
} else {
|
||||
sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with "
|
||||
<< errno << ": " << strerror(errno) << std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
sif::info << "Opened " << watchdog::FIFO_NAME << " successfully" << std::endl;
|
||||
performStartHandling();
|
||||
} else if (watchdogFifoFd > 0) {
|
||||
// Write to OBSW watchdog FIFO here
|
||||
const char writeChar = watchdog::first::IDLE_CHAR;
|
||||
ssize_t writtenBytes = write(watchdogFifoFd, &writeChar, 1);
|
||||
if (writtenBytes < 0) {
|
||||
sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno)
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * @brief Opens the watchdog FIFO and announces the OBSW start to the watchdog.
 *
 * @param enableWatchdogFunction  If true, the start message carries the watch flag in addition
 *                                to the start character.
 * @return returnvalue::OK if the FIFO does not exist, if the FIFO could not be opened with ENXIO
 *         (the open is retried in periodicOperation(), which also sends the start message), or
 *         if the start message was written successfully. returnvalue::FAILED if opening the FIFO
 *         failed for another reason or if writing the start message failed.
 */
ReturnValue_t WatchdogHandler::initialize(bool enableWatchdogFunction) {
  this->enableWatchFunction = enableWatchdogFunction;
  if (not std::filesystem::exists(watchdog::FIFO_NAME)) {
    // Still return returnvalue::OK for now
    sif::info << "Watchdog FIFO " << watchdog::FIFO_NAME << " does not exist, can't initiate"
              << " watchdog" << std::endl;
    return returnvalue::OK;
  }

  // Open FIFO write only and non-blocking to prevent SW from killing itself.
  watchdogFifoFd = open(watchdog::FIFO_NAME.c_str(), O_WRONLY | O_NONBLOCK);
  if (watchdogFifoFd < 0) {
    // Snapshot errno before any stream output can overwrite it.
    const int openErrno = errno;
    if (openErrno == ENXIO) {
      watchdogFifoFd = RETRY_FIFO_OPEN;
      sif::info << "eive-watchdog not running. FIFO can not be opened" << std::endl;
      // There is no valid descriptor yet, so the start message must not be written here.
      // periodicOperation() sends it once the FIFO could be opened.
      return returnvalue::OK;
    }
    sif::error << "Opening pipe " << watchdog::FIFO_NAME << " write-only failed with "
               << openErrno << ": " << strerror(openErrno) << std::endl;
    return returnvalue::FAILED;
  }
  return performStartHandling();
}
|
||||
|
||||
/**
 * @brief Writes the start message to the watchdog FIFO.
 *
 * The message is the start character, followed by the watch flag if the watch functionality
 * was requested in initialize().
 *
 * @return returnvalue::OK if write() did not report an error, returnvalue::FAILED otherwise.
 */
ReturnValue_t WatchdogHandler::performStartHandling() {
  // Byte 0 is always sent; byte 1 is only transmitted when the watch function is enabled.
  const char startMsg[2] = {watchdog::first::START_CHAR, watchdog::second::WATCH_FLAG};
  const size_t msgLen = enableWatchFunction ? 2 : 1;
  if (write(watchdogFifoFd, startMsg, msgLen) < 0) {
    sif::error << "Errors writing to watchdog FIFO, code " << errno << ": " << strerror(errno)
               << std::endl;
    return returnvalue::FAILED;
  }
  return returnvalue::OK;
}
|
23
bsp_q7s/core/WatchdogHandler.h
Normal file
23
bsp_q7s/core/WatchdogHandler.h
Normal file
@ -0,0 +1,23 @@
|
||||
#ifndef BSP_Q7S_CORE_WATCHDOGHANDLER_H_
|
||||
#define BSP_Q7S_CORE_WATCHDOGHANDLER_H_
|
||||
|
||||
#include "fsfw/returnvalues/returnvalue.h"
|
||||
|
||||
class WatchdogHandler {
|
||||
public:
|
||||
WatchdogHandler();
|
||||
|
||||
ReturnValue_t initialize(bool enableWatchFunction);
|
||||
void periodicOperation();
|
||||
|
||||
private:
|
||||
// Designated value for rechecking FIFO open
|
||||
static constexpr int RETRY_FIFO_OPEN = -2;
|
||||
|
||||
int watchdogFifoFd = 0;
|
||||
bool enableWatchFunction = false;
|
||||
|
||||
ReturnValue_t performStartHandling();
|
||||
};
|
||||
|
||||
#endif /* BSP_Q7S_CORE_WATCHDOGHANDLER_H_ */
|
@ -9,6 +9,7 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "OBSWConfig.h"
|
||||
#include "bsp_q7s/core/WatchdogHandler.h"
|
||||
#include "commonConfig.h"
|
||||
#include "core/scheduling.h"
|
||||
#include "fsfw/tasks/TaskFactory.h"
|
||||
@ -24,6 +25,9 @@ static const char* DEV_STRING = "Xiphos Q7S FM";
|
||||
#else
|
||||
static const char* DEV_STRING = "Xiphos Q7S EM";
|
||||
#endif
|
||||
|
||||
WatchdogHandler WATCHDOG_HANDLER;
|
||||
|
||||
int obsw::obsw() {
|
||||
using namespace fsfw;
|
||||
std::cout << "-- EIVE OBSW --" << std::endl;
|
||||
@ -44,6 +48,35 @@ int obsw::obsw() {
|
||||
}
|
||||
#endif
|
||||
|
||||
// Delay the boot if applicable.
|
||||
bootDelayHandling();
|
||||
|
||||
bool initWatchFunction = false;
|
||||
if (std::filesystem::current_path() == "/usr/bin") {
|
||||
initWatchFunction = true;
|
||||
}
|
||||
ReturnValue_t result = WATCHDOG_HANDLER.initialize(initWatchFunction);
|
||||
if (result != returnvalue::OK) {
|
||||
std::cerr << "Initiating EIVE watchdog handler failed" << std::endl;
|
||||
}
|
||||
|
||||
scheduling::initMission();
|
||||
|
||||
// Command the EIVE system to safe mode
|
||||
#if OBSW_COMMAND_SAFE_MODE_AT_STARTUP == 1
|
||||
commandEiveSystemToSafe();
|
||||
#else
|
||||
announceAllModes();
|
||||
#endif
|
||||
|
||||
for (;;) {
|
||||
WATCHDOG_HANDLER.periodicOperation();
|
||||
TaskFactory::delayTask(1000);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void obsw::bootDelayHandling() {
|
||||
const char* homedir = nullptr;
|
||||
homedir = getenv("HOME");
|
||||
if (homedir == nullptr) {
|
||||
@ -71,31 +104,26 @@ int obsw::obsw() {
|
||||
std::cout << "Delaying OBSW start for " << bootDelaySecs << " seconds" << std::endl;
|
||||
TaskFactory::delayTask(bootDelaySecs * 1000);
|
||||
}
|
||||
}
|
||||
|
||||
scheduling::initMission();
|
||||
|
||||
// Command the EIVE system to safe mode
|
||||
void obsw::commandEiveSystemToSafe() {
|
||||
auto sysQueueId = satsystem::EIVE_SYSTEM.getCommandQueue();
|
||||
CommandMessage msg;
|
||||
#if OBSW_COMMAND_SAFE_MODE_AT_STARTUP == 1
|
||||
ModeMessage::setCmdModeMessage(msg, acs::AcsMode::SAFE, 0);
|
||||
ReturnValue_t result =
|
||||
MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false);
|
||||
if (result != returnvalue::OK) {
|
||||
sif::error << "Sending safe mode command to EIVE system failed" << std::endl;
|
||||
}
|
||||
#else
|
||||
}
|
||||
|
||||
void obsw::announceAllModes() {
|
||||
auto sysQueueId = satsystem::EIVE_SYSTEM.getCommandQueue();
|
||||
CommandMessage msg;
|
||||
ModeMessage::setModeAnnounceMessage(msg, true);
|
||||
ReturnValue_t result =
|
||||
MessageQueueSenderIF::sendMessage(sysQueueId, &msg, MessageQueueIF::NO_QUEUE, false);
|
||||
if (result != returnvalue::OK) {
|
||||
sif::error << "Sending safe mode command to EIVE system failed" << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (;;) {
|
||||
/* Suspend main thread by sleeping it. */
|
||||
TaskFactory::delayTask(5000);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -5,6 +5,10 @@ namespace obsw {
|
||||
|
||||
int obsw();
|
||||
|
||||
};
|
||||
void bootDelayHandling();
|
||||
void commandEiveSystemToSafe();
|
||||
void announceAllModes();
|
||||
|
||||
}; // namespace obsw
|
||||
|
||||
#endif /* BSP_Q7S_CORE_OBSW_H_ */
|
||||
|
@ -4,6 +4,7 @@ if [[ ! -f README.md ]]; then
|
||||
fi
|
||||
|
||||
folder_list=(
|
||||
"./watchdog"
|
||||
"./mission"
|
||||
"./linux"
|
||||
"./bsp_q7s"
|
||||
|
@ -1,10 +1,5 @@
|
||||
target_sources(${WATCHDOG_NAME} PRIVATE
|
||||
main.cpp
|
||||
Watchdog.cpp
|
||||
)
|
||||
target_sources(${WATCHDOG_NAME} PRIVATE main.cpp Watchdog.cpp)
|
||||
|
||||
target_include_directories(${WATCHDOG_NAME} PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
)
|
||||
target_include_directories(${WATCHDOG_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
install(TARGETS ${WATCHDOG_NAME} RUNTIME DESTINATION bin)
|
||||
|
@ -1,96 +1,61 @@
|
||||
#include "Watchdog.h"
|
||||
#include "definitions.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <poll.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <poll.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
|
||||
#include "definitions.h"
|
||||
|
||||
WatchdogTask::WatchdogTask (): fd(0) {
|
||||
WatchdogTask::WatchdogTask() : fd(0) {
|
||||
int result = 0;
|
||||
// Only create the FIFO if it does not exist yet
|
||||
if(not std::filesystem::exists(watchdog::FIFO_NAME)) {
|
||||
if (not std::filesystem::exists(watchdog::FIFO_NAME)) {
|
||||
// Permission 666 or rw-rw-rw-
|
||||
mode_t mode = DEFFILEMODE;
|
||||
result = mkfifo(watchdog::FIFO_NAME.c_str(), mode);
|
||||
if(result != 0) {
|
||||
std::cerr << "eive-watchdog: Could not created named pipe at " <<
|
||||
watchdog::FIFO_NAME << ", error " << errno << ": " << strerror(errno) <<
|
||||
std::endl;
|
||||
if (result != 0) {
|
||||
std::cerr << "eive-watchdog: Could not created named pipe at " << watchdog::FIFO_NAME
|
||||
<< ", error " << errno << ": " << strerror(errno) << std::endl;
|
||||
throw std::runtime_error("eive-watchdog: FIFO creation failed");
|
||||
}
|
||||
#if WATCHDOG_VERBOSE_LEVEL >= 1
|
||||
std::cout << "eive-watchdog: Pipe at " << watchdog::FIFO_NAME <<
|
||||
" created successfully" << std::endl;
|
||||
std::cout << "eive-watchdog: Pipe at " << watchdog::FIFO_NAME << " created successfully"
|
||||
<< std::endl;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
WatchdogTask::~WatchdogTask() {
|
||||
|
||||
}
|
||||
WatchdogTask::~WatchdogTask() {}
|
||||
|
||||
int WatchdogTask::performOperation() {
|
||||
// Open FIFO read only and non-blocking
|
||||
fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK);
|
||||
if(fd < 0) {
|
||||
std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME <<
|
||||
"read-only failed with " << errno << ": " << strerror(errno) << std::endl;
|
||||
if (fd < 0) {
|
||||
std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME << "read-only failed with "
|
||||
<< errno << ": " << strerror(errno) << std::endl;
|
||||
return -1;
|
||||
}
|
||||
state = States::RUNNING;
|
||||
state = States::NOT_STARTED;
|
||||
|
||||
while(true) {
|
||||
while (true) {
|
||||
WatchdogTask::LoopResult loopResult = watchdogLoop();
|
||||
switch(loopResult) {
|
||||
case(LoopResult::OK): {
|
||||
performRunningOperation();
|
||||
if (not stateMachine(loopResult)) {
|
||||
break;
|
||||
}
|
||||
case(LoopResult::CANCEL_RQ): {
|
||||
std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
case(LoopResult::SUSPEND_RQ): {
|
||||
performSuspendOperation();
|
||||
break;
|
||||
}
|
||||
case(LoopResult::TIMEOUT): {
|
||||
performNotRunningOperation(loopResult);
|
||||
break;
|
||||
}
|
||||
case(LoopResult::HUNG_UP): {
|
||||
performNotRunningOperation(loopResult);
|
||||
break;
|
||||
}
|
||||
case(LoopResult::RESTART_RQ): {
|
||||
if(state == States::SUSPENDED or state == States::FAULTY) {
|
||||
performRunningOperation();
|
||||
}
|
||||
break;
|
||||
}
|
||||
case(LoopResult::FAULT): {
|
||||
using namespace std::chrono_literals;
|
||||
// Configuration error
|
||||
std::cerr << "Fault has occured in watchdog loop" << std::endl;
|
||||
// Prevent spam
|
||||
std::this_thread::sleep_for(2000ms);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
if (close(fd) < 0) {
|
||||
std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME <<
|
||||
"failed, error " << errno << ": " << strerror(errno) << std::endl;
|
||||
std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME << "failed, error "
|
||||
<< errno << ": " << strerror(errno) << std::endl;
|
||||
}
|
||||
std::cout << "eive-watchdog: Finished" << std::endl;
|
||||
return 0;
|
||||
@ -102,38 +67,17 @@ WatchdogTask::LoopResult WatchdogTask::watchdogLoop() {
|
||||
waiter.fd = fd;
|
||||
waiter.events = POLLIN;
|
||||
|
||||
switch(state) {
|
||||
case(States::SUSPENDED): {
|
||||
// Sleep, then check whether a restart request was received
|
||||
std::this_thread::sleep_for(1000ms);
|
||||
break;
|
||||
}
|
||||
case(States::RUNNING): {
|
||||
// Continue as usual
|
||||
break;
|
||||
}
|
||||
case(States::NOT_STARTED): {
|
||||
// This should not happen
|
||||
std::cerr << "eive-watchdog: State is NOT_STARTED, configuration error" << std::endl;
|
||||
break;
|
||||
}
|
||||
case(States::FAULTY): {
|
||||
// TODO: Not sure what to do yet. Continue for now
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 10 seconds timeout, only poll one file descriptor
|
||||
switch(poll(&waiter, 1, watchdog::TIMEOUT_MS)) {
|
||||
case(0): {
|
||||
// Only poll one file descriptor with timeout
|
||||
switch (poll(&waiter, 1, watchdog::TIMEOUT_MS)) {
|
||||
case (0): {
|
||||
return LoopResult::TIMEOUT;
|
||||
}
|
||||
case(1): {
|
||||
case (1): {
|
||||
return pollEvent(waiter);
|
||||
}
|
||||
default: {
|
||||
std::cerr << "eive-watchdog: Unknown poll error at " << watchdog::FIFO_NAME << ", error " <<
|
||||
errno << ": " << strerror(errno) << std::endl;
|
||||
std::cerr << "eive-watchdog: Unknown poll error at " << watchdog::FIFO_NAME << ", error "
|
||||
<< errno << ": " << strerror(errno) << std::endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -144,58 +88,55 @@ WatchdogTask::LoopResult WatchdogTask::pollEvent(struct pollfd& waiter) {
|
||||
if (waiter.revents & POLLIN) {
|
||||
ssize_t readLen = read(fd, buf.data(), buf.size());
|
||||
if (readLen < 0) {
|
||||
std::cerr << "eive-watchdog: Read error on pipe " << watchdog::FIFO_NAME <<
|
||||
", error " << errno << ": " << strerror(errno) << std::endl;
|
||||
std::cerr << "eive-watchdog: Read error on pipe " << watchdog::FIFO_NAME << ", error "
|
||||
<< errno << ": " << strerror(errno) << std::endl;
|
||||
return LoopResult::OK;
|
||||
}
|
||||
#if WATCHDOG_VERBOSE_LEVEL == 2
|
||||
std::cout << "Read " << readLen << " byte(s) on the pipe " << FIFO_NAME
|
||||
<< std::endl;
|
||||
std::cout << "Read " << readLen << " byte(s) on the pipe " << FIFO_NAME << std::endl;
|
||||
#endif
|
||||
else if(readLen >= 1) {
|
||||
return parseCommandByte(readLen);
|
||||
else if (readLen >= 1) {
|
||||
return parseCommand(readLen);
|
||||
}
|
||||
|
||||
}
|
||||
else if(waiter.revents & POLLERR) {
|
||||
std::cerr << "eive-watchdog: Poll error error on pipe " << watchdog::FIFO_NAME <<
|
||||
std::endl;
|
||||
} else if (waiter.revents & POLLERR) {
|
||||
std::cerr << "eive-watchdog: Poll error error on pipe " << watchdog::FIFO_NAME << std::endl;
|
||||
return LoopResult::FAULT;
|
||||
}
|
||||
else if (waiter.revents & POLLHUP) {
|
||||
} else if (waiter.revents & POLLHUP) {
|
||||
// Writer closed its end
|
||||
return LoopResult::HUNG_UP;
|
||||
}
|
||||
return LoopResult::FAULT;
|
||||
}
|
||||
|
||||
WatchdogTask::LoopResult WatchdogTask::parseCommandByte(ssize_t readLen) {
|
||||
for(ssize_t idx = 0; idx < readLen; idx++) {
|
||||
char readChar = buf[idx];
|
||||
WatchdogTask::LoopResult WatchdogTask::parseCommand(ssize_t readLen) {
|
||||
char readChar = buf[0];
|
||||
// Cancel request
|
||||
if(readChar == watchdog::CANCEL_CHAR) {
|
||||
return LoopResult::CANCEL_RQ;
|
||||
}
|
||||
// Begin request. Does not work if the operation was not suspended before
|
||||
else if(readChar == watchdog::RESTART_CHAR) {
|
||||
return LoopResult::RESTART_RQ;
|
||||
}
|
||||
if (readChar == watchdog::first::CANCEL_CHAR) {
|
||||
return LoopResult::CANCEL_REQ;
|
||||
} else if (readChar == watchdog::first::SUSPEND_CHAR) {
|
||||
// Suspend request
|
||||
else if(readChar == watchdog::SUSPEND_CHAR) {
|
||||
return LoopResult::SUSPEND_RQ;
|
||||
return LoopResult::SUSPEND_REQ;
|
||||
} else if (readChar == watchdog::first::START_CHAR) {
|
||||
if (readLen == 2 and static_cast<char>(buf[1]) == watchdog::second::WATCH_FLAG) {
|
||||
return LoopResult::START_WITH_WATCH_REQ;
|
||||
}
|
||||
return LoopResult::START_REQ;
|
||||
}
|
||||
// Everything else: All working as expected
|
||||
}
|
||||
return LoopResult::OK;
|
||||
}
|
||||
|
||||
int WatchdogTask::performRunningOperation() {
|
||||
if(state != States::RUNNING) {
|
||||
if (state != States::RUNNING) {
|
||||
state = States::RUNNING;
|
||||
}
|
||||
if (notRunningStart.has_value()) {
|
||||
notRunningStart = std::nullopt;
|
||||
}
|
||||
|
||||
if(not obswRunning) {
|
||||
if(printNotRunningLatch) {
|
||||
if (not obswRunning) {
|
||||
if (printNotRunningLatch) {
|
||||
// Reset latch so user can see timeouts
|
||||
printNotRunningLatch = false;
|
||||
}
|
||||
@ -206,9 +147,8 @@ int WatchdogTask::performRunningOperation() {
|
||||
std::cout << "eive-watchdog: Creating " << watchdog::RUNNING_FILE_NAME << std::endl;
|
||||
if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) {
|
||||
std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME);
|
||||
if(not obswRunningFile.good()) {
|
||||
std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed"
|
||||
<< std::endl;
|
||||
if (not obswRunningFile.good()) {
|
||||
std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed" << std::endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -218,29 +158,41 @@ int WatchdogTask::performRunningOperation() {
|
||||
|
||||
int WatchdogTask::performNotRunningOperation(LoopResult type) {
|
||||
// Latch prevents spam on console
|
||||
if(not printNotRunningLatch) {
|
||||
if(type == LoopResult::HUNG_UP) {
|
||||
if (not printNotRunningLatch) {
|
||||
if (type == LoopResult::HUNG_UP) {
|
||||
std::cout << "eive-watchdog: FIFO writer hung up!" << std::endl;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
std::cout << "eive-watchdog: The FIFO timed out!" << std::endl;
|
||||
}
|
||||
printNotRunningLatch = true;
|
||||
}
|
||||
|
||||
if(obswRunning) {
|
||||
if (not notRunningStart.has_value()) {
|
||||
notRunningStart = std::chrono::system_clock::now();
|
||||
}
|
||||
|
||||
if (obswRunning) {
|
||||
#if WATCHDOG_CREATE_FILE_IF_RUNNING == 1
|
||||
if (std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) {
|
||||
int result = std::remove(watchdog::RUNNING_FILE_NAME.c_str());
|
||||
if(result != 0) {
|
||||
std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code " <<
|
||||
errno << ": " << strerror(errno) << std::endl;
|
||||
if (result != 0) {
|
||||
std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code " << errno
|
||||
<< ": " << strerror(errno) << std::endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
obswRunning = false;
|
||||
}
|
||||
if(type == LoopResult::HUNG_UP) {
|
||||
|
||||
if (watchingObsw) {
|
||||
auto timeNotRunning = std::chrono::system_clock::now() - notRunningStart.value();
|
||||
if (std::chrono::duration_cast<std::chrono::milliseconds>(timeNotRunning).count() >
|
||||
watchdog::MAX_NOT_RUNNING_MS) {
|
||||
std::cout << "Restarting OBSW" << std::endl;
|
||||
std::system("systemctl restart obsw");
|
||||
}
|
||||
}
|
||||
if (type == LoopResult::HUNG_UP) {
|
||||
using namespace std::chrono_literals;
|
||||
// Prevent spam
|
||||
std::this_thread::sleep_for(2000ms);
|
||||
@ -248,11 +200,77 @@ int WatchdogTask::performNotRunningOperation(LoopResult type) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int WatchdogTask::performSuspendOperation() {
|
||||
if(state == States::RUNNING or state == States::FAULTY) {
|
||||
bool WatchdogTask::stateMachine(LoopResult loopResult) {
|
||||
using namespace std::chrono_literals;
|
||||
bool sleep = false;
|
||||
switch (state) {
|
||||
case (States::RUNNING): {
|
||||
switch (loopResult) {
|
||||
case (LoopResult::TIMEOUT):
|
||||
case (LoopResult::HUNG_UP): {
|
||||
performNotRunningOperation(loopResult);
|
||||
break;
|
||||
}
|
||||
case (LoopResult::OK): {
|
||||
performRunningOperation();
|
||||
break;
|
||||
}
|
||||
case (LoopResult::SUSPEND_REQ): {
|
||||
if (state == States::RUNNING or state == States::FAULTY) {
|
||||
std::cout << "eive-watchdog: Suspending watchdog operations" << std::endl;
|
||||
watchdogRunning = false;
|
||||
state = States::SUSPENDED;
|
||||
}
|
||||
return 0;
|
||||
performSuspendOperation();
|
||||
sleep = true;
|
||||
break;
|
||||
}
|
||||
case (LoopResult::CANCEL_REQ): {
|
||||
std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
case (States::FAULTY):
|
||||
case (States::SUSPENDED):
|
||||
case (States::NOT_STARTED): {
|
||||
switch (loopResult) {
|
||||
case (LoopResult::SUSPEND_REQ): {
|
||||
// Ignore and also delay
|
||||
sleep = true;
|
||||
break;
|
||||
}
|
||||
case (LoopResult::START_REQ):
|
||||
case (LoopResult::START_WITH_WATCH_REQ): {
|
||||
if (state == States::NOT_STARTED or state == States::FAULTY) {
|
||||
state = States::RUNNING;
|
||||
}
|
||||
std::cout << "Watch request received. Restarting OBSW if not running for "
|
||||
<< watchdog::MAX_NOT_RUNNING_MS / 1000 << " seconds" << std::endl;
|
||||
if (loopResult == LoopResult::START_REQ) {
|
||||
watchingObsw = false;
|
||||
} else if (loopResult == LoopResult::START_WITH_WATCH_REQ) {
|
||||
watchingObsw = true;
|
||||
}
|
||||
performRunningOperation();
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
sleep = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (loopResult == LoopResult::FAULT) {
|
||||
// Configuration error
|
||||
std::cerr << "Fault has occured in watchdog loop" << std::endl;
|
||||
// Prevent spam
|
||||
sleep = true;
|
||||
}
|
||||
if (sleep) {
|
||||
std::this_thread::sleep_for(1000ms);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int WatchdogTask::performSuspendOperation() { return 0; }
|
||||
|
@ -2,23 +2,21 @@
|
||||
#define WATCHDOG_WATCHDOG_H_
|
||||
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
class WatchdogTask {
|
||||
public:
|
||||
enum class States {
|
||||
NOT_STARTED,
|
||||
RUNNING,
|
||||
SUSPENDED,
|
||||
FAULTY
|
||||
};
|
||||
public:
|
||||
enum class States { NOT_STARTED, RUNNING, SUSPENDED, FAULTY };
|
||||
|
||||
enum class LoopResult {
|
||||
OK,
|
||||
SUSPEND_RQ,
|
||||
CANCEL_RQ,
|
||||
RESTART_RQ,
|
||||
START_REQ,
|
||||
START_WITH_WATCH_REQ,
|
||||
SUSPEND_REQ,
|
||||
CANCEL_REQ,
|
||||
TIMEOUT,
|
||||
HUNG_UP,
|
||||
FAULT
|
||||
@ -29,18 +27,23 @@ public:
|
||||
virtual ~WatchdogTask();
|
||||
|
||||
int performOperation();
|
||||
private:
|
||||
|
||||
private:
|
||||
int fd = 0;
|
||||
|
||||
bool obswRunning = false;
|
||||
bool watchdogRunning = false;
|
||||
bool watchingObsw = false;
|
||||
bool printNotRunningLatch = false;
|
||||
std::array<uint8_t, 64> buf;
|
||||
std::optional<std::chrono::time_point<std::chrono::system_clock>> notRunningStart;
|
||||
States state = States::NOT_STARTED;
|
||||
|
||||
// Primary loop. Takes care of delaying, and reading from the communication pipe and translating
|
||||
// messages to loop results.
|
||||
LoopResult watchdogLoop();
|
||||
bool stateMachine(LoopResult result);
|
||||
LoopResult pollEvent(struct pollfd& waiter);
|
||||
LoopResult parseCommandByte(ssize_t readLen);
|
||||
LoopResult parseCommand(ssize_t readLen);
|
||||
|
||||
int performRunningOperation();
|
||||
int performNotRunningOperation(LoopResult type);
|
||||
|
@ -5,17 +5,31 @@
|
||||
|
||||
namespace watchdog {
|
||||
|
||||
namespace first {
|
||||
|
||||
// Start or restart character
|
||||
static constexpr char START_CHAR = 'b';
|
||||
// Suspend watchdog operations temporarily
|
||||
static constexpr char SUSPEND_CHAR = 's';
|
||||
// Resume watchdog operations
|
||||
static constexpr char RESTART_CHAR = 'b';
|
||||
// Causes the watchdog to close down
|
||||
static constexpr char CANCEL_CHAR = 'c';
|
||||
static constexpr char IDLE_CHAR = 'i';
|
||||
|
||||
} // namespace first
|
||||
|
||||
namespace second {
|
||||
|
||||
// Supplied with the start character. This will instruct the watchdog to actually watch
|
||||
// whether the OBSW is running all the time.
|
||||
static constexpr char WATCH_FLAG = 'w';
|
||||
} // namespace second
|
||||
|
||||
static constexpr int TIMEOUT_MS = 5 * 1000;
|
||||
// 2 minutes
|
||||
static constexpr unsigned MAX_NOT_RUNNING_MS = 2 * 60 * 1000;
|
||||
const std::string FIFO_NAME = "/tmp/watchdog-pipe";
|
||||
const std::string RUNNING_FILE_NAME = "/tmp/obsw-running";
|
||||
|
||||
}
|
||||
} // namespace watchdog
|
||||
|
||||
#endif /* WATCHDOG_DEFINITIONS_H_ */
|
||||
|
@ -1,7 +1,7 @@
|
||||
#include "Watchdog.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "Watchdog.h"
|
||||
|
||||
/**
|
||||
* @brief This watchdog application uses a FIFO to check whether the OBSW is still running.
|
||||
* It checks whether the OBSW writes to the the FIFO regularly.
|
||||
@ -11,14 +11,12 @@ int main() {
|
||||
try {
|
||||
WatchdogTask watchdogTask;
|
||||
int result = watchdogTask.performOperation();
|
||||
if(result != 0) {
|
||||
if (result != 0) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
catch(const std::runtime_error& e) {
|
||||
} catch (const std::runtime_error& e) {
|
||||
std::cerr << "eive-watchdog: Run time exception " << e.what() << std::endl;
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user