reworked watchdog
This commit is contained in:
@ -1,258 +1,276 @@
|
||||
#include "Watchdog.h"
|
||||
#include "definitions.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <poll.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <poll.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
|
||||
#include "definitions.h"
|
||||
|
||||
/**
 * Creates the named pipe at watchdog::FIFO_NAME unless something already exists at that path.
 *
 * @throws std::runtime_error if mkfifo() fails.
 */
WatchdogTask::WatchdogTask() : fd(0) {
  // Only create the FIFO if it does not exist yet
  if (std::filesystem::exists(watchdog::FIFO_NAME)) {
    return;
  }
  // Permission 666 or rw-rw-rw-
  const mode_t mode = DEFFILEMODE;
  if (mkfifo(watchdog::FIFO_NAME.c_str(), mode) != 0) {
    // Save errno first: the stream output below may overwrite it before it is printed
    const int errorCode = errno;
    std::cerr << "eive-watchdog: Could not created named pipe at " << watchdog::FIFO_NAME
              << ", error " << errorCode << ": " << strerror(errorCode) << std::endl;
    throw std::runtime_error("eive-watchdog: FIFO creation failed");
  }
#if WATCHDOG_VERBOSE_LEVEL >= 1
  std::cout << "eive-watchdog: Pipe at " << watchdog::FIFO_NAME << " created successfully"
            << std::endl;
#endif
}
|
||||
|
||||
// Nothing to release here: performOperation() closes the pipe descriptor it opened before it
// returns.
WatchdogTask::~WatchdogTask() = default;
|
||||
|
||||
int WatchdogTask::performOperation() {
|
||||
// Open FIFO read only and non-blocking
|
||||
fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK);
|
||||
if(fd < 0) {
|
||||
std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME <<
|
||||
"read-only failed with " << errno << ": " << strerror(errno) << std::endl;
|
||||
return -1;
|
||||
}
|
||||
state = States::RUNNING;
|
||||
// Open FIFO read only and non-blocking
|
||||
fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK);
|
||||
if (fd < 0) {
|
||||
std::cerr << "eive-watchdog: Opening pipe " << watchdog::FIFO_NAME << "read-only failed with "
|
||||
<< errno << ": " << strerror(errno) << std::endl;
|
||||
return -1;
|
||||
}
|
||||
state = States::NOT_STARTED;
|
||||
|
||||
while(true) {
|
||||
WatchdogTask::LoopResult loopResult = watchdogLoop();
|
||||
switch(loopResult) {
|
||||
case(LoopResult::OK): {
|
||||
performRunningOperation();
|
||||
break;
|
||||
}
|
||||
case(LoopResult::CANCEL_RQ): {
|
||||
std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
case(LoopResult::SUSPEND_RQ): {
|
||||
performSuspendOperation();
|
||||
break;
|
||||
}
|
||||
case(LoopResult::TIMEOUT): {
|
||||
performNotRunningOperation(loopResult);
|
||||
break;
|
||||
}
|
||||
case(LoopResult::HUNG_UP): {
|
||||
performNotRunningOperation(loopResult);
|
||||
break;
|
||||
}
|
||||
case(LoopResult::RESTART_RQ): {
|
||||
if(state == States::SUSPENDED or state == States::FAULTY) {
|
||||
performRunningOperation();
|
||||
}
|
||||
break;
|
||||
}
|
||||
case(LoopResult::FAULT): {
|
||||
using namespace std::chrono_literals;
|
||||
// Configuration error
|
||||
std::cerr << "Fault has occured in watchdog loop" << std::endl;
|
||||
// Prevent spam
|
||||
std::this_thread::sleep_for(2000ms);
|
||||
|
||||
}
|
||||
}
|
||||
while (true) {
|
||||
WatchdogTask::LoopResult loopResult = watchdogLoop();
|
||||
if (not stateMachine(loopResult)) {
|
||||
break;
|
||||
}
|
||||
if (close(fd) < 0) {
|
||||
std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME <<
|
||||
"failed, error " << errno << ": " << strerror(errno) << std::endl;
|
||||
}
|
||||
std::cout << "eive-watchdog: Finished" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
if (close(fd) < 0) {
|
||||
std::cerr << "eive-watchdog: Closing named pipe at " << watchdog::FIFO_NAME << "failed, error "
|
||||
<< errno << ": " << strerror(errno) << std::endl;
|
||||
}
|
||||
std::cout << "eive-watchdog: Finished" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Polls the FIFO descriptor once, waiting at most watchdog::TIMEOUT_MS.
 *
 * @return LoopResult::TIMEOUT if poll() reported no event in time, the result of pollEvent() if
 *         the descriptor reported an event, and LoopResult::FAULT if poll() itself failed.
 */
WatchdogTask::LoopResult WatchdogTask::watchdogLoop() {
  struct pollfd waiter = {};
  waiter.fd = fd;
  waiter.events = POLLIN;

  // Only poll one file descriptor with timeout
  const int pollResult = poll(&waiter, 1, watchdog::TIMEOUT_MS);
  if (pollResult == 0) {
    return LoopResult::TIMEOUT;
  }
  if (pollResult == 1) {
    return pollEvent(waiter);
  }

  // Save errno first: the stream output below may overwrite it before it is printed
  const int errorCode = errno;
  std::cerr << "eive-watchdog: Unknown poll error at " << watchdog::FIFO_NAME << ", error "
            << errorCode << ": " << strerror(errorCode) << std::endl;
  // A failed poll() carries no information from the writer. Reporting OK here would make
  // stateMachine() call performRunningOperation() as if the OBSW had just sent a byte.
  return LoopResult::FAULT;
}
|
||||
|
||||
/**
 * Translates the event flags reported by poll() into a loop result.
 *
 * @param waiter poll descriptor whose revents field was filled in by poll().
 * @return For POLLIN: the result of parseCommand() if at least one byte was read, OK if read()
 *         failed, FAULT if read() returned 0. For POLLERR: FAULT. For POLLHUP: HUNG_UP.
 *         FAULT if none of these flags is set.
 */
WatchdogTask::LoopResult WatchdogTask::pollEvent(struct pollfd& waiter) {
  if (waiter.revents & POLLIN) {
    const ssize_t readLen = read(fd, buf.data(), buf.size());
    if (readLen < 0) {
      // Save errno first: the stream output below may overwrite it before it is printed
      const int errorCode = errno;
      std::cerr << "eive-watchdog: Read error on pipe " << watchdog::FIFO_NAME << ", error "
                << errorCode << ": " << strerror(errorCode) << std::endl;
      return LoopResult::OK;
    }
#if WATCHDOG_VERBOSE_LEVEL == 2
    std::cout << "Read " << readLen << " byte(s) on the pipe " << watchdog::FIFO_NAME
              << std::endl;
#endif
    // Deliberately a plain `if`: an `else if` here would be separated from its `if` by the
    // verbose print above and would not compile with WATCHDOG_VERBOSE_LEVEL == 2.
    if (readLen >= 1) {
      return parseCommand(readLen);
    }
    // readLen == 0 ends up at the FAULT return at the bottom
  } else if (waiter.revents & POLLERR) {
    std::cerr << "eive-watchdog: Poll error error on pipe " << watchdog::FIFO_NAME << std::endl;
    return LoopResult::FAULT;
  } else if (waiter.revents & POLLHUP) {
    // Writer closed its end
    return LoopResult::HUNG_UP;
  }
  return LoopResult::FAULT;
}
|
||||
|
||||
/**
 * Interprets the bytes most recently read into buf. Only buf[0] selects the command; buf[1] is
 * looked at solely for a start command that is exactly two bytes long.
 *
 * @param readLen number of valid bytes in buf.
 * @return CANCEL_REQ, SUSPEND_REQ, START_REQ or START_WITH_WATCH_REQ for the matching command
 *         character, OK for any other byte (and for readLen < 1).
 */
WatchdogTask::LoopResult WatchdogTask::parseCommand(ssize_t readLen) {
  if (readLen < 1) {
    // Nothing was read, so buf[0] would be stale data
    return LoopResult::OK;
  }
  const char command = buf[0];
  if (command == watchdog::first::CANCEL_CHAR) {
    // Cancel request
    return LoopResult::CANCEL_REQ;
  }
  if (command == watchdog::first::SUSPEND_CHAR) {
    // Suspend request
    return LoopResult::SUSPEND_REQ;
  }
  if (command == watchdog::first::START_CHAR) {
    const bool watchRequested =
        readLen == 2 and static_cast<char>(buf[1]) == watchdog::second::WATCH_FLAG;
    return watchRequested ? LoopResult::START_WITH_WATCH_REQ : LoopResult::START_REQ;
  }
  // Everything else: All working as expected
  return LoopResult::OK;
}
|
||||
|
||||
int WatchdogTask::performRunningOperation() {
|
||||
if(state != States::RUNNING) {
|
||||
state = States::RUNNING;
|
||||
if (state != States::RUNNING) {
|
||||
state = States::RUNNING;
|
||||
}
|
||||
if (notRunningStart.has_value()) {
|
||||
notRunningStart = std::nullopt;
|
||||
}
|
||||
|
||||
if (not obswRunning) {
|
||||
if (printNotRunningLatch) {
|
||||
// Reset latch so user can see timeouts
|
||||
printNotRunningLatch = false;
|
||||
}
|
||||
|
||||
if(not obswRunning) {
|
||||
if(printNotRunningLatch) {
|
||||
// Reset latch so user can see timeouts
|
||||
printNotRunningLatch = false;
|
||||
}
|
||||
|
||||
obswRunning = true;
|
||||
std::cout << "eive-watchdog: Running OBSW detected.." << std::endl;
|
||||
obswRunning = true;
|
||||
std::cout << "eive-watchdog: Running OBSW detected.." << std::endl;
|
||||
#if WATCHDOG_CREATE_FILE_IF_RUNNING == 1
|
||||
std::cout << "eive-watchdog: Creating " << watchdog::RUNNING_FILE_NAME << std::endl;
|
||||
if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) {
|
||||
std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME);
|
||||
if(not obswRunningFile.good()) {
|
||||
std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
std::cout << "eive-watchdog: Creating " << watchdog::RUNNING_FILE_NAME << std::endl;
|
||||
if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) {
|
||||
std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME);
|
||||
if (not obswRunningFile.good()) {
|
||||
std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed" << std::endl;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Handles a poll cycle in which the OBSW did not report in.
 *
 * Prints one console message per outage, remembers when the outage started, removes
 * watchdog::RUNNING_FILE_NAME (if WATCHDOG_CREATE_FILE_IF_RUNNING is 1) and, when watching was
 * requested, restarts the OBSW once the outage exceeds watchdog::MAX_NOT_RUNNING_MS.
 *
 * @param type LoopResult::HUNG_UP for a closed writer end; any other value is reported as a
 *             timeout.
 * @return always 0.
 */
int WatchdogTask::performNotRunningOperation(LoopResult type) {
  // Latch prevents spam on console
  if (not printNotRunningLatch) {
    if (type == LoopResult::HUNG_UP) {
      std::cout << "eive-watchdog: FIFO writer hung up!" << std::endl;
    } else {
      std::cout << "eive-watchdog: The FIFO timed out!" << std::endl;
    }
    printNotRunningLatch = true;
  }

  if (not notRunningStart.has_value()) {
    notRunningStart = std::chrono::system_clock::now();
  }

  if (obswRunning) {
#if WATCHDOG_CREATE_FILE_IF_RUNNING == 1
    if (std::filesystem::exists(watchdog::RUNNING_FILE_NAME)) {
      if (std::remove(watchdog::RUNNING_FILE_NAME.c_str()) != 0) {
        // Save errno first: the stream output below may overwrite it before it is printed
        const int errorCode = errno;
        std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code "
                  << errorCode << ": " << strerror(errorCode) << std::endl;
      }
    }
#endif
    obswRunning = false;
  }

  if (watchingObsw) {
    const auto now = std::chrono::system_clock::now();
    const auto timeNotRunning = now - notRunningStart.value();
    if (std::chrono::duration_cast<std::chrono::milliseconds>(timeNotRunning).count() >
        watchdog::MAX_NOT_RUNNING_MS) {
      std::cout << "Restarting OBSW" << std::endl;
      const int restartResult = std::system("systemctl restart obsw");
      if (restartResult != 0) {
        std::cerr << "eive-watchdog: Restarting OBSW failed with code " << restartResult
                  << std::endl;
      }
      // Start a fresh grace period. Without this, every following timeout or hang-up would
      // trigger another restart before the OBSW had a chance to come up.
      notRunningStart = now;
    }
  }

  if (type == LoopResult::HUNG_UP) {
    using namespace std::chrono_literals;
    // Prevent spam
    std::this_thread::sleep_for(2000ms);
  }
  return 0;
}
|
||||
|
||||
int WatchdogTask::performSuspendOperation() {
|
||||
if(state == States::RUNNING or state == States::FAULTY) {
|
||||
std::cout << "eive-watchdog: Suspending watchdog operations" << std::endl;
|
||||
watchdogRunning = false;
|
||||
state = States::SUSPENDED;
|
||||
bool WatchdogTask::stateMachine(LoopResult loopResult) {
|
||||
using namespace std::chrono_literals;
|
||||
bool sleep = false;
|
||||
switch (state) {
|
||||
case (States::RUNNING): {
|
||||
switch (loopResult) {
|
||||
case (LoopResult::TIMEOUT):
|
||||
case (LoopResult::HUNG_UP): {
|
||||
performNotRunningOperation(loopResult);
|
||||
break;
|
||||
}
|
||||
case (LoopResult::OK): {
|
||||
performRunningOperation();
|
||||
break;
|
||||
}
|
||||
case (LoopResult::SUSPEND_REQ): {
|
||||
if (state == States::RUNNING or state == States::FAULTY) {
|
||||
std::cout << "eive-watchdog: Suspending watchdog operations" << std::endl;
|
||||
state = States::SUSPENDED;
|
||||
}
|
||||
performSuspendOperation();
|
||||
sleep = true;
|
||||
break;
|
||||
}
|
||||
case (LoopResult::CANCEL_REQ): {
|
||||
std::cout << "eive-watchdog: Received cancel request, closing watchdog.." << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
case (States::FAULTY):
|
||||
case (States::SUSPENDED):
|
||||
case (States::NOT_STARTED): {
|
||||
switch (loopResult) {
|
||||
case (LoopResult::SUSPEND_REQ): {
|
||||
// Ignore and also delay
|
||||
sleep = true;
|
||||
break;
|
||||
}
|
||||
case (LoopResult::START_REQ):
|
||||
case (LoopResult::START_WITH_WATCH_REQ): {
|
||||
if (state == States::NOT_STARTED or state == States::FAULTY) {
|
||||
state = States::RUNNING;
|
||||
}
|
||||
std::cout << "Watch request received. Restarting OBSW if not running for "
|
||||
<< watchdog::MAX_NOT_RUNNING_MS / 1000 << " seconds" << std::endl;
|
||||
if (loopResult == LoopResult::START_REQ) {
|
||||
watchingObsw = false;
|
||||
} else if (loopResult == LoopResult::START_WITH_WATCH_REQ) {
|
||||
watchingObsw = true;
|
||||
}
|
||||
performRunningOperation();
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
sleep = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (loopResult == LoopResult::FAULT) {
|
||||
// Configuration error
|
||||
std::cerr << "Fault has occured in watchdog loop" << std::endl;
|
||||
// Prevent spam
|
||||
sleep = true;
|
||||
}
|
||||
if (sleep) {
|
||||
std::this_thread::sleep_for(1000ms);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int WatchdogTask::performSuspendOperation() { return 0; }
|
||||
|
Reference in New Issue
Block a user