Update Docker configuration, enhance error handling, and improve logging
- Added a health check to the camera management API service in docker-compose.yml for better container reliability.
- Updated installation scripts in the Dockerfile to check for existing dependencies before installation, improving efficiency.
- Enhanced error handling in the USDAVisionSystem class to allow partial operation if some components fail to start, preventing immediate shutdown.
- Improved logging throughout the application, including more detailed error messages and critical error handling in the main loop.
- Refactored the WebSocketManager and CameraMonitor classes to use debug logging for connection events, reducing log noise.
This commit is contained in:
@@ -208,23 +208,50 @@ class USDAVisionSystem:
|
||||
def run(self) -> None:
|
||||
"""Run the system (blocking call)"""
|
||||
if not self.start():
|
||||
self.logger.error("Failed to start system")
|
||||
return
|
||||
self.logger.error("Failed to start system - some components may not be available")
|
||||
# Don't exit immediately - allow partial operation if some components started
|
||||
# Only exit if critical components failed
|
||||
if not self.running:
|
||||
self.logger.critical("Critical components failed to start - exiting")
|
||||
return
|
||||
|
||||
try:
|
||||
self.logger.info("System running... Press Ctrl+C to stop")
|
||||
|
||||
# Main loop - just keep the system alive
|
||||
consecutive_errors = 0
|
||||
max_consecutive_errors = 10
|
||||
|
||||
while self.running:
|
||||
time.sleep(1)
|
||||
try:
|
||||
time.sleep(1)
|
||||
consecutive_errors = 0 # Reset on successful iteration
|
||||
|
||||
# Periodic maintenance tasks could go here
|
||||
# For example: cleanup old recordings, health checks, etc.
|
||||
# Periodic maintenance tasks could go here
|
||||
# For example: cleanup old recordings, health checks, etc.
|
||||
|
||||
# Health check: verify critical components are still running
|
||||
if not self.mqtt_client.is_running():
|
||||
self.logger.warning("MQTT client stopped running - attempting restart")
|
||||
try:
|
||||
self.mqtt_client.start()
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to restart MQTT client: {e}")
|
||||
consecutive_errors += 1
|
||||
|
||||
except Exception as e:
|
||||
consecutive_errors += 1
|
||||
self.logger.error(f"Error in main loop (consecutive: {consecutive_errors}): {e}", exc_info=True)
|
||||
|
||||
# If too many consecutive errors, exit to prevent infinite crash loop
|
||||
if consecutive_errors >= max_consecutive_errors:
|
||||
self.logger.critical(f"Too many consecutive errors ({consecutive_errors}) - shutting down to prevent crash loop")
|
||||
break
|
||||
|
||||
except KeyboardInterrupt:
|
||||
self.logger.info("Keyboard interrupt received")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Unexpected error in main loop: {e}")
|
||||
self.logger.error(f"Unexpected error in main loop: {e}", exc_info=True)
|
||||
finally:
|
||||
self.stop()
|
||||
|
||||
@@ -270,8 +297,14 @@ def main():
|
||||
|
||||
try:
|
||||
system.run()
|
||||
except KeyboardInterrupt:
|
||||
logging.info("Interrupted by user")
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
logging.error(f"Fatal error: {e}")
|
||||
logging.critical(f"Fatal error: {e}", exc_info=True)
|
||||
# Give a moment for logs to flush
|
||||
import time
|
||||
time.sleep(1)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user