#include "Watchdog.h"

#include <fcntl.h>
#include <poll.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <optional>
#include <stdexcept>
#include <system_error>
#include <thread>

#include "definitions.h"

/**
 * Creates the named pipe at watchdog::FIFO_NAME unless a file system entry with that name
 * already exists.
 *
 * @throws std::runtime_error if mkfifo() fails.
 */
WatchdogTask::WatchdogTask() : fd(0) {
  std::error_code e;
  // Only create the FIFO if it does not exist yet
  if (not std::filesystem::exists(watchdog::FIFO_NAME, e)) {
    // Permission 666 or rw-rw-rw-
    const mode_t mode = DEFFILEMODE;
    if (mkfifo(watchdog::FIFO_NAME.c_str(), mode) != 0) {
      // Capture errno before any stream operation can overwrite it
      const int err = errno;
      std::cerr << "Could not created named pipe at " << watchdog::FIFO_NAME << ", error " << err
                << ": " << strerror(err) << std::endl;
      throw std::runtime_error("eive-watchdog: FIFO creation failed");
    }
#if WATCHDOG_VERBOSE_LEVEL >= 1
    std::cout << "Pipe at " << watchdog::FIFO_NAME << " created successfully" << std::endl;
#endif
  }
}

WatchdogTask::~WatchdogTask() = default;

/**
 * Opens the FIFO, discards any stale content and then alternates between polling the pipe
 * (watchdogLoop) and feeding the queued results to stateMachine until the state machine
 * returns false.
 *
 * @return 0 after the loop was left and the pipe was closed, -1 if the FIFO could not be
 *         opened.
 */
int WatchdogTask::performOperation() {
  // Open FIFO read only and non-blocking
  fd = open(watchdog::FIFO_NAME.c_str(), O_RDONLY | O_NONBLOCK);
  if (fd < 0) {
    const int err = errno;
    std::cerr << "Opening pipe " << watchdog::FIFO_NAME << " read-only failed with " << err << ": "
              << strerror(err) << std::endl;
    return -1;
  }
  // Clear FIFO by reading until it is empty.
  while (true) {
    const ssize_t readBytes = read(fd, buf.data(), buf.size());
    if (readBytes > 0) {
      // Stale data, discard it and keep draining
      continue;
    }
    if (readBytes == 0) {
      // End of file: nothing left to read
      break;
    }
    const int err = errno;
    if (err == EINTR) {
      // Interrupted by a signal before any data was read, simply retry
      continue;
    }
    if (err != EAGAIN) {
      std::cerr << "Read error of FIFO: " << strerror(err) << std::endl;
    }
    // EAGAIN: the pipe is empty but still held open by a writer, so it is drained.
    // Any other error is reported once instead of being retried forever.
    break;
  }

  state = States::NOT_STARTED;
  bool keepRunning = true;
  while (keepRunning) {
    watchdogLoop();
    while (not resultQueue.empty()) {
      const auto nextRequest = resultQueue.front();
      resultQueue.pop();
      if (not stateMachine(nextRequest)) {
        // Leave both loops; remaining queue entries are not processed
        keepRunning = false;
        break;
      }
    }
  }
  if (close(fd) < 0) {
    const int err = errno;
    std::cerr << "Closing named pipe at " << watchdog::FIFO_NAME << " failed, error " << err << ": "
              << strerror(err) << std::endl;
  }
  std::cout << "Closing" << std::endl;
  return 0;
}

/**
 * Waits up to watchdog::TIMEOUT_MS for activity on the FIFO and pushes the outcome to the
 * result queue: TIMEOUT if nothing happened, the results of pollEvent if the descriptor
 * became ready, FAULT if poll() itself failed.
 */
void WatchdogTask::watchdogLoop() {
  struct pollfd waiter = {};
  waiter.fd = fd;
  waiter.events = POLLIN;
  // Only poll one file descriptor with timeout
  switch (poll(&waiter, 1, watchdog::TIMEOUT_MS)) {
    case (0): {
      resultQueue.push(LoopResult::TIMEOUT);
      return;
    }
    case (1): {
      pollEvent(waiter);
      return;
    }
    default: {
      const int err = errno;
      if (err == EINTR) {
        // Interrupted by a signal: not an error, the caller polls again
        return;
      }
      std::cerr << "Unknown poll error at " << watchdog::FIFO_NAME << ", error " << err << ": "
                << strerror(err) << std::endl;
      // Queue a fault so the state machine delays the next iteration instead of spinning
      resultQueue.push(LoopResult::FAULT);
      return;
    }
  }
}

/**
 * Evaluates the events reported by poll() for the FIFO.
 *
 * Readable data is read into buf and handed to parseCommands, POLLERR queues FAULT and
 * POLLHUP queues HUNG_UP. POLLIN takes precedence if several flags are set.
 *
 * @param waiter poll descriptor as filled in by poll()
 */
void WatchdogTask::pollEvent(struct pollfd& waiter) {
  if (waiter.revents & POLLIN) {
    const ssize_t readLen = read(fd, buf.data(), buf.size());
    // Capture errno before the optional verbose output can overwrite it
    const int err = errno;
#if WATCHDOG_VERBOSE_LEVEL == 2
    std::cout << "Read " << readLen << " byte(s) on the pipe " << watchdog::FIFO_NAME << std::endl;
#endif
    if (readLen < 0) {
      std::cerr << "Read error on pipe " << watchdog::FIFO_NAME << ", error " << err << ": "
                << strerror(err) << std::endl;
      // NOTE(review): a failed read is reported as OK, unlike POLLERR below which is
      // reported as FAULT. Kept as is, confirm this is intended.
      resultQueue.push(LoopResult::OK);
    } else if (readLen >= 1) {
      parseCommands(readLen);
    }
  } else if (waiter.revents & POLLERR) {
    std::cerr << "Poll error error on pipe " << watchdog::FIFO_NAME << std::endl;
    resultQueue.push(LoopResult::FAULT);
  } else if (waiter.revents & POLLHUP) {
    // Writer closed its end
    resultQueue.push(LoopResult::HUNG_UP);
  }
}

/**
 * Translates the first readLen bytes of buf into queue entries, one per recognized
 * command character. A start character directly followed by the watch flag is consumed
 * as a single two-byte command.
 *
 * @param readLen number of valid bytes in buf
 */
void WatchdogTask::parseCommands(ssize_t readLen) {
  for (ssize_t idx = 0; idx < readLen; idx++) {
    const char nextChar = buf[idx];
    if (nextChar == watchdog::first::CANCEL_CHAR) {
      // Cancel request
      resultQueue.push(LoopResult::CANCEL_REQ);
    } else if (nextChar == watchdog::first::SUSPEND_CHAR) {
      // Suspend request
      resultQueue.push(LoopResult::SUSPEND_REQ);
    } else if (nextChar == watchdog::first::START_CHAR) {
      const bool hasFollowUp = idx + 1 < readLen;
      if (hasFollowUp and static_cast<char>(buf[idx + 1]) == watchdog::second::WATCH_FLAG) {
        resultQueue.push(LoopResult::START_WITH_WATCH_REQ);
        // Skip the flag byte which belongs to this command
        idx++;
        continue;
      }
      resultQueue.push(LoopResult::START_REQ);
    } else if (nextChar == watchdog::first::IDLE_CHAR) {
      resultQueue.push(LoopResult::OK);
    }
    // Any other character is ignored
  }
}

/**
 * Marks the OBSW as running: switches to the RUNNING state, clears the not-running timer
 * and, on the transition from not running to running, re-arms the console latch and
 * (if WATCHDOG_CREATE_FILE_IF_RUNNING is 1) creates the running file.
 *
 * @return always 0
 */
int WatchdogTask::performRunningOperation() {
  state = States::RUNNING;
  notRunningStart = std::nullopt;
  if (not obswRunning) {
    // Reset latch so user can see timeouts
    printNotRunningLatch = false;
    obswRunning = true;
    std::cout << "OBSW is running" << std::endl;
#if WATCHDOG_CREATE_FILE_IF_RUNNING == 1
    std::cout << "Creating " << watchdog::RUNNING_FILE_NAME << std::endl;
    std::error_code e;
    if (not std::filesystem::exists(watchdog::RUNNING_FILE_NAME, e)) {
      std::ofstream obswRunningFile(watchdog::RUNNING_FILE_NAME);
      if (not obswRunningFile.good()) {
        std::cerr << "Creating file " << watchdog::RUNNING_FILE_NAME << " failed" << std::endl;
      }
    }
#endif
  }
  return 0;
}

/**
 * Handles a timeout or hang-up of the FIFO.
 *
 * Prints a message once per outage, starts the not-running timer, removes the running file
 * (if WATCHDOG_CREATE_FILE_IF_RUNNING is 1) and, when the OBSW is being watched and has been
 * silent for more than watchdog::MAX_NOT_RUNNING_MS, restarts it with systemctl.
 *
 * @param type HUNG_UP selects the hang-up message and an additional 2 second delay, any
 *             other value is treated as a timeout
 * @return always 0
 */
int WatchdogTask::performNotRunningOperation(LoopResult type) {
  // Latch prevents spam on console
  if (not printNotRunningLatch) {
    if (type == LoopResult::HUNG_UP) {
      std::cout << "OBSW hung up" << std::endl;
    } else {
      std::cout << "The FIFO timed out, OBSW might not be running" << std::endl;
    }
    printNotRunningLatch = true;
  }
  if (not notRunningStart.has_value()) {
    notRunningStart = std::chrono::steady_clock::now();
  }

  if (obswRunning) {
#if WATCHDOG_CREATE_FILE_IF_RUNNING == 1
    std::cout << "Removing " << watchdog::RUNNING_FILE_NAME << std::endl;
    std::error_code e;
    if (std::filesystem::exists(watchdog::RUNNING_FILE_NAME, e)) {
      int result = std::remove(watchdog::RUNNING_FILE_NAME.c_str());
      if (result != 0) {
        const int err = errno;
        std::cerr << "Removing " << watchdog::RUNNING_FILE_NAME << " failed with code " << err
                  << ": " << strerror(err) << std::endl;
      }
    }
#endif
    obswRunning = false;
  }

  if (watchingObsw) {
    const auto timeNotRunning = std::chrono::steady_clock::now() - notRunningStart.value();
    const auto notRunningMs =
        std::chrono::duration_cast<std::chrono::milliseconds>(timeNotRunning).count();
    if (notRunningMs > watchdog::MAX_NOT_RUNNING_MS) {
      std::cout << "Restarting OBSW with systemctl" << std::endl;
      const int restartResult = std::system("systemctl restart obsw");
      if (restartResult != 0) {
        std::cerr << "Restarting OBSW failed, systemctl returned " << restartResult << std::endl;
      }
      // Re-arm the timer. Without this every following timeout or hang-up would trigger
      // another restart before the OBSW had a chance to report in.
      notRunningStart = std::chrono::steady_clock::now();
    }
  }
  if (type == LoopResult::HUNG_UP) {
    using namespace std::chrono_literals;
    // Prevent spam
    std::this_thread::sleep_for(2000ms);
  }
  return 0;
}

/**
 * Processes one queued loop result depending on the current state.
 *
 * @param loopResult result produced by watchdogLoop / parseCommands
 * @return false if a cancel request was received while in the RUNNING state and the
 *         watchdog shall terminate, true otherwise
 */
bool WatchdogTask::stateMachine(LoopResult loopResult) {
  using namespace std::chrono_literals;
  bool sleep = false;
  switch (state) {
    case (States::RUNNING):
      switch (loopResult) {
        case (LoopResult::TIMEOUT):
        case (LoopResult::HUNG_UP): {
          performNotRunningOperation(loopResult);
          break;
        }
        case (LoopResult::OK): {
          performRunningOperation();
          break;
        }
        case (LoopResult::SUSPEND_REQ): {
          // The state is known to be RUNNING inside this case
          std::cout << "Received suspend request, suspending watchdog operations" << std::endl;
          state = States::SUSPENDED;
          performSuspendOperation();
          sleep = true;
          break;
        }
        case (LoopResult::CANCEL_REQ): {
          std::cout << "Received cancel request, closing watchdog.." << std::endl;
          return false;
        }
        default: {
          break;
        }
      }
      // NOTE(review): there is no break here, so the RUNNING case continues into the shared
      // handling below. Start requests received while RUNNING are only processed through
      // this path, and every other result additionally requests the 500 ms delay. Kept to
      // preserve behaviour, confirm this is intended.
      [[fallthrough]];
    case (States::FAULTY):
    case (States::SUSPENDED):
    case (States::NOT_STARTED): {
      switch (loopResult) {
        case (LoopResult::SUSPEND_REQ): {
          // Ignore and also delay
          sleep = true;
          break;
        }
        case (LoopResult::START_REQ):
        case (LoopResult::START_WITH_WATCH_REQ): {
          if (state == States::NOT_STARTED or state == States::FAULTY) {
            state = States::RUNNING;
          }
          if (loopResult == LoopResult::START_REQ) {
            std::cout << "Start request without watch request received" << std::endl;
            watchingObsw = false;
          } else {
            std::cout << "Start request with watch request received. Restarting OBSW if not "
                         "running for "
                      << watchdog::MAX_NOT_RUNNING_MS / 1000 << " seconds" << std::endl;
            watchingObsw = true;
          }
          performRunningOperation();
          break;
        }
        default: {
          sleep = true;
        }
      }
      break;
    }
  }
  if (loopResult == LoopResult::FAULT) {
    // Configuration error
    std::cerr << "Fault has occured in watchdog loop" << std::endl;
    // Prevent spam
    sleep = true;
  }
  if (sleep) {
    std::this_thread::sleep_for(500ms);
  }
  return true;
}

/**
 * Hook called when a suspend request is accepted. Currently does nothing.
 *
 * @return always 0
 */
int WatchdogTask::performSuspendOperation() { return 0; }