305 lines
9.1 KiB
C++
305 lines
9.1 KiB
C++
#include "Watchdog.h"
|
|
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <poll.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/types.h>
|
|
#include <unistd.h>
|
|
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <filesystem>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <thread>
|
|
|
|
#include "definitions.h"
|
|
|
|
// Makes sure the named pipe (FIFO) at watchdog::FIFO_NAME exists.
//
// The FIFO is created directly and an already existing path (EEXIST) is tolerated. This avoids
// the race of a separate "does it exist?" check followed by the creation, where another process
// creating the FIFO in between would turn into a fatal error.
//
// Throws std::runtime_error if mkfifo fails for any reason other than the path already existing.
WatchdogTask::WatchdogTask() : fd(0) {
  // Permission 666 or rw-rw-rw-
  const mode_t mode = DEFFILEMODE;
  if (mkfifo(watchdog::FIFO_NAME.c_str(), mode) != 0) {
    // Save errno right away, the stream operations below are allowed to modify it.
    const int errorCode = errno;
    if (errorCode != EEXIST) {
      std::cerr << "Could not create named pipe at " << watchdog::FIFO_NAME << ", error "
                << errorCode << ": " << strerror(errorCode) << std::endl;
      throw std::runtime_error("eive-watchdog: FIFO creation failed");
    }
    // The path is already present, nothing was created.
    return;
  }
#if WATCHDOG_VERBOSE_LEVEL >= 1
  std::cout << "Pipe at " << watchdog::FIFO_NAME << " created successfully" << std::endl;
#endif
}
|
|
|
|
// The destructor performs no work of its own.
WatchdogTask::~WatchdogTask() = default;
|
|
|
|
int WatchdogTask::performOperation() {
|
|
// Open FIFO read only and non-blocking
|
|
fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK);
|
|
if (fd < 0) {
|
|
std::cerr << "Opening pipe " << watchdog::FIFO_NAME << "read-only failed with " << errno << ": "
|
|
<< strerror(errno) << std::endl;
|
|
return -1;
|
|
}
|
|
// Clear FIFO by reading until it is empty.
|
|
while (true) {
|
|
ssize_t readBytes = read(fd, buf.data(), buf.size());
|
|
if (readBytes < 0) {
|
|
std::cerr << "Read error of FIFO: " << strerror(errno) << std::endl;
|
|
} else if (readBytes == 0) {
|
|
break;
|
|
}
|
|
}
|
|
state = States::NOT_STARTED;
|
|
|
|
bool breakOuter = false;
|
|
while (true) {
|
|
watchdogLoop();
|
|
while (not resultQueue.empty()) {
|
|
auto nextRequest = resultQueue.front();
|
|
if (not stateMachine(nextRequest)) {
|
|
breakOuter = true;
|
|
resultQueue.pop();
|
|
break;
|
|
}
|
|
resultQueue.pop();
|
|
}
|
|
if (breakOuter) {
|
|
break;
|
|
}
|
|
}
|
|
if (close(fd) < 0) {
|
|
std::cerr << "Closing named pipe at " << watchdog::FIFO_NAME << "failed, error " << errno
|
|
<< ": " << strerror(errno) << std::endl;
|
|
}
|
|
std::cout << "Closing" << std::endl;
|
|
return 0;
|
|
}
|
|
|
|
void WatchdogTask::watchdogLoop() {
|
|
using namespace std::chrono_literals;
|
|
struct pollfd waiter = {};
|
|
waiter.fd = fd;
|
|
waiter.events = POLLIN;
|
|
|
|
// Only poll one file descriptor with timeout
|
|
switch (poll(&waiter, 1, watchdog::TIMEOUT_MS)) {
|
|
case (0): {
|
|
resultQueue.push(LoopResult::TIMEOUT);
|
|
return;
|
|
}
|
|
case (1): {
|
|
pollEvent(waiter);
|
|
return;
|
|
}
|
|
default: {
|
|
std::cerr << "Unknown poll error at " << watchdog::FIFO_NAME << ", error " << errno << ": "
|
|
<< strerror(errno) << std::endl;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
void WatchdogTask::pollEvent(struct pollfd& waiter) {
|
|
if (waiter.revents & POLLIN) {
|
|
ssize_t readLen = read(fd, buf.data(), buf.size());
|
|
#if WATCHDOG_VERBOSE_LEVEL == 2
|
|
std::cout << "Read " << readLen << " byte(s) on the pipe " << watchdog::FIFO_NAME << std::endl;
|
|
#endif
|
|
if (readLen < 0) {
|
|
std::cerr << "Read error on pipe " << watchdog::FIFO_NAME << ", error " << errno << ": "
|
|
<< strerror(errno) << std::endl;
|
|
resultQueue.push(LoopResult::OK);
|
|
} else if (readLen >= 1) {
|
|
parseCommands(readLen);
|
|
}
|
|
|
|
} else if (waiter.revents & POLLERR) {
|
|
std::cerr << "Poll error error on pipe " << watchdog::FIFO_NAME << std::endl;
|
|
resultQueue.push(LoopResult::FAULT);
|
|
} else if (waiter.revents & POLLHUP) {
|
|
// Writer closed its end
|
|
resultQueue.push(LoopResult::HUNG_UP);
|
|
}
|
|
}
|
|
|
|
void WatchdogTask::parseCommands(ssize_t readLen) {
|
|
for (ssize_t idx = 0; idx < readLen; idx++) {
|
|
char nextChar = buf[idx];
|
|
// Cancel request
|
|
if (nextChar == watchdog::first::CANCEL_CHAR) {
|
|
resultQueue.push(LoopResult::CANCEL_REQ);
|
|
} else if (nextChar == watchdog::first::SUSPEND_CHAR) {
|
|
// Suspend request
|
|
resultQueue.push(LoopResult::SUSPEND_REQ);
|
|
} else if (nextChar == watchdog::first::START_CHAR) {
|
|
if (idx < readLen - 1 and static_cast<char>(buf[idx + 1]) == watchdog::second::WATCH_FLAG) {
|
|
resultQueue.push(LoopResult::START_WITH_WATCH_REQ);
|
|
idx++;
|
|
continue;
|
|
}
|
|
resultQueue.push(LoopResult::START_REQ);
|
|
} else if (nextChar == watchdog::first::IDLE_CHAR) {
|
|
resultQueue.push(LoopResult::OK);
|
|
}
|
|
}
|
|
// Everything else: All working as expected
|
|
}
|
|
|
|
int WatchdogTask::performRunningOperation() {
|
|
if (state != States::RUNNING) {
|
|
state = States::RUNNING;
|
|
}
|
|
if (notRunningStart.has_value()) {
|
|
notRunningStart = std::nullopt;
|
|
}
|
|
|
|
if (not obswRunning) {
|
|
if (printNotRunningLatch) {
|
|
// Reset latch so user can see timeouts
|
|
printNotRunningLatch = false;
|
|
}
|
|
|
|
obswRunning = true;
|
|
std::cout << "OBSW is running" << std::endl;
|
|
#if WATCHDOG_CREATE_FILE_IF_RUNNING == 1
|
|
std::cout << "Creating " << watchdog::RUNNING_FILE_NAME << std::endl;
|
|
std::error_code e;
|
|
if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME, e)) {
|
|
std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME);
|
|
if (not obswRunningFile.good()) {
|
|
std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed" << std::endl;
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Handles a missing sign of life from the OBSW (FIFO timeout or writer hang-up).
//
// - Prints a notice once until the latch is reset again.
// - Remembers when the OBSW was first seen as not running.
// - Marks the OBSW as not running and, if WATCHDOG_CREATE_FILE_IF_RUNNING is 1, removes the file
//   watchdog::RUNNING_FILE_NAME.
// - If the OBSW is being watched and has not been running for more than
//   watchdog::MAX_NOT_RUNNING_MS, restarts it with systemctl and begins a new grace period.
// - Delays for two seconds after a hang-up.
//
// type is the result that triggered the call; HUNG_UP selects the hang-up notice and the
// two-second delay, any other value selects the timeout notice.
//
// Always returns 0.
int WatchdogTask::performNotRunningOperation(LoopResult type) {
  // Latch prevents spam on console
  if (not printNotRunningLatch) {
    if (type == LoopResult::HUNG_UP) {
      std::cout << "OBSW hung up" << std::endl;
    } else {
      std::cout << "The FIFO timed out, OBSW might not be running" << std::endl;
    }
    printNotRunningLatch = true;
  }

  if (not notRunningStart.has_value()) {
    notRunningStart = std::chrono::steady_clock::now();
  }

  if (obswRunning) {
#if WATCHDOG_CREATE_FILE_IF_RUNNING == 1
    std::cout << "Removing " << watchdog::RUNNING_FILE_NAME << std::endl;
    std::error_code e;
    if (std::filesystem::exists(watchdog::RUNNING_FILE_NAME, e)) {
      if (std::remove(watchdog::RUNNING_FILE_NAME.c_str()) != 0) {
        // Save errno right away, the stream operations below are allowed to modify it.
        const int errorCode = errno;
        std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code "
                  << errorCode << ": " << strerror(errorCode) << std::endl;
      }
    }
#endif
    obswRunning = false;
  }

  if (watchingObsw) {
    const auto timeNotRunning = std::chrono::steady_clock::now() - notRunningStart.value();
    if (std::chrono::duration_cast<std::chrono::milliseconds>(timeNotRunning).count() >
        watchdog::MAX_NOT_RUNNING_MS) {
      std::cout << "Restarting OBSW with systemctl" << std::endl;
      const int restartResult = std::system("systemctl restart obsw");
      if (restartResult != 0) {
        std::cerr << "Restarting OBSW failed, std::system returned " << restartResult
                  << std::endl;
      }
      // Begin a new grace period. Without this, every following timeout or hang-up event would
      // exceed the limit as well and issue another restart while the OBSW is still starting up.
      notRunningStart = std::chrono::steady_clock::now();
    }
  }

  if (type == LoopResult::HUNG_UP) {
    using namespace std::chrono_literals;
    // Prevent spam
    std::this_thread::sleep_for(2000ms);
  }
  return 0;
}
|
|
|
|
// Processes one result of the watchdog loop depending on the current state.
//
// In the RUNNING state, timeouts and hang-ups are forwarded to performNotRunningOperation, OK is
// forwarded to performRunningOperation, a suspend request switches to SUSPENDED, and a cancel
// request ends the watchdog. Execution then continues with the handling shared with the FAULTY,
// SUSPENDED and NOT_STARTED states: start requests update the watch flag and are forwarded to
// performRunningOperation, every other result only requests a delay. A FAULT result is logged
// in addition. A requested delay is applied as a 500 ms sleep at the end.
//
// Returns false if a cancel request was received in the RUNNING state, true otherwise.
bool WatchdogTask::stateMachine(LoopResult loopResult) {
  using namespace std::chrono_literals;
  bool throttle = false;

  switch (state) {
    case (States::RUNNING): {
      if (loopResult == LoopResult::CANCEL_REQ) {
        std::cout << "Received cancel request, closing watchdog.." << std::endl;
        return false;
      }
      if (loopResult == LoopResult::TIMEOUT or loopResult == LoopResult::HUNG_UP) {
        performNotRunningOperation(loopResult);
      } else if (loopResult == LoopResult::OK) {
        performRunningOperation();
      } else if (loopResult == LoopResult::SUSPEND_REQ) {
        // The state is known to be RUNNING inside this case, so the transition always applies.
        std::cout << "Received suspend request, suspending watchdog operations" << std::endl;
        state = States::SUSPENDED;
        performSuspendOperation();
        throttle = true;
      }
    }
      // The RUNNING case does not end here: it continues with the shared handling below. This
      // is what processes start requests while RUNNING and delays after every other result.
      [[fallthrough]];
    case (States::FAULTY):
    case (States::SUSPENDED):
    case (States::NOT_STARTED): {
      const bool plainStart = loopResult == LoopResult::START_REQ;
      const bool watchedStart = loopResult == LoopResult::START_WITH_WATCH_REQ;
      if (plainStart or watchedStart) {
        if (state == States::NOT_STARTED or state == States::FAULTY) {
          state = States::RUNNING;
        }
        if (plainStart) {
          std::cout << "Start request without watch request received" << std::endl;
          watchingObsw = false;
        } else {
          std::cout << "Start request with watch request received. Restarting OBSW if not "
                       "running for "
                    << watchdog::MAX_NOT_RUNNING_MS / 1000 << " seconds" << std::endl;
          watchingObsw = true;
        }
        performRunningOperation();
      } else {
        // Suspend requests are ignored here; like all remaining results they only cause a delay.
        throttle = true;
      }
      break;
    }
  }

  if (loopResult == LoopResult::FAULT) {
    // Configuration error
    std::cerr << "Fault has occured in watchdog loop" << std::endl;
    // Prevent spam
    throttle = true;
  }
  if (throttle) {
    std::this_thread::sleep_for(500ms);
  }
  return true;
}
|
|
|
|
// Hook that stateMachine calls when a suspend request is accepted. It currently performs no
// work.
//
// Always returns 0.
int WatchdogTask::performSuspendOperation() {
  return 0;
}
|