pacemaker  1.1.19-c3c624ea3d
Scalable High-Availability cluster resource manager
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013 Lars Marowsky-Bree <lmb@suse.com>
3  * 2014-2018 Andrew Beekhof <andrew@beekhof.net>
4  *
5  * This source code is licensed under the GNU Lesser General Public License
6  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
7  */
8 
9 #include <crm_internal.h>
10 
11 #include <sched.h>
12 #include <sys/ioctl.h>
13 #include <sys/reboot.h>
14 
15 #include <sys/types.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <signal.h>
21 
22 #ifdef _POSIX_MEMLOCK
23 # include <sys/mman.h>
24 #endif
25 
/* Cached PID of the sbd daemon as last determined by pcmk_locate_sbd();
 * 0 until sbd has been located (or when it could not be found).
 */
static int sbd_pid = 0;
27 
29 {
34 };
35 
/* procfs file that sysrq_init() reads and rewrites to enable SysRq functions */
#define SYSRQ "/proc/sys/kernel/sysrq"
37 
/*
 * Make sure the kernel SysRq functions used for panicking are enabled.
 *
 * Reads the current integer value from SYSRQ. A value of 1 is left alone
 * (per the kernel's sysrq documentation that means every function is
 * enabled); otherwise the bits for process dumps (8) and reboot/poweroff
 * (128) are OR-ed in and the result is written back.
 *
 * Only the first call does any work; subsequent calls return immediately.
 * Without SUPPORT_PROCFS this function does nothing.
 */
void
sysrq_init(void)
{
#if SUPPORT_PROCFS
    static bool need_init = true;
    FILE *procf = NULL;
    int c = 0;
    bool write_failed = false;

    if (!need_init) {
        return;
    }
    need_init = false;

    procf = fopen(SYSRQ, "r");
    if (procf == NULL) {
        crm_perror(LOG_WARNING, "Cannot open " SYSRQ " for read");
        return;
    }
    if (fscanf(procf, "%d", &c) != 1) {
        /* Treat an unparsable value as "nothing enabled" */
        crm_perror(LOG_ERR, "Parsing " SYSRQ " failed");
        c = 0;
    }
    fclose(procf);
    if (c == 1) {
        return;
    }

    /* 8 for debugging dumps of processes, 128 for reboot/poweroff */
    c |= 136;
    procf = fopen(SYSRQ, "w");
    if (procf == NULL) {
        crm_perror(LOG_ERR, "Cannot write to " SYSRQ);
        return;
    }

    /* The stream is buffered, so a failed write may only surface when the
     * stream is closed; fclose() is evaluated first so it always runs.
     */
    write_failed = (fprintf(procf, "%d", c) < 0);
    if ((fclose(procf) != 0) || write_failed) {
        crm_perror(LOG_ERR, "Cannot write to " SYSRQ);
    }
#endif // SUPPORT_PROCFS
}
77 
/*
 * Ask the kernel to perform a SysRq action.
 *
 * t: the SysRq command character, written (followed by a newline) to
 *    /proc/sysrq-trigger after sysrq_init() has been called.
 *
 * Failure to open the trigger file is logged as a warning and otherwise
 * ignored. Without SUPPORT_PROCFS this function does nothing.
 */
static void
sysrq_trigger(char t)
{
#if SUPPORT_PROCFS
    FILE *trigger = NULL;

    sysrq_init();

    trigger = fopen("/proc/sysrq-trigger", "a");
    if (trigger != NULL) {
        crm_info("sysrq-trigger: %c", t);
        fprintf(trigger, "%c\n", t);
        fclose(trigger);

    } else {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
    }
#endif // SUPPORT_PROCFS
}
97 
98 
99 static void
100 pcmk_panic_local(void)
101 {
102  int rc = pcmk_ok;
103  uid_t uid = geteuid();
104  pid_t ppid = getppid();
105 
106  if(uid != 0 && ppid > 1) {
107  /* We're a non-root pacemaker daemon (cib, crmd, pengine,
108  * attrd, etc) with the original pacemakerd parent
109  *
110  * Of these, only crmd is likely to be initiating resets
111  */
112  do_crm_log_always(LOG_EMERG, "Signaling parent %d to panic", ppid);
114  return;
115 
116  } else if (uid != 0) {
117 #if SUPPORT_PROCFS
118  /*
119  * No permissions, and no pacemakerd parent to escalate to.
120  * Track down the new pacemakerd process and send a signal instead.
121  */
122  union sigval signal_value;
123 
124  memset(&signal_value, 0, sizeof(signal_value));
125  ppid = crm_procfs_pid_of("pacemakerd");
126  do_crm_log_always(LOG_EMERG, "Signaling pacemakerd(%d) to panic", ppid);
127 
128  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
129  crm_perror(LOG_EMERG, "Cannot signal pacemakerd(%d) to panic", ppid);
130  }
131 #endif // SUPPORT_PROCFS
132 
133  /* The best we can do now is die */
135  return;
136  }
137 
138  /* We're either pacemakerd, or a pacemaker daemon running as root */
139 
140  if (safe_str_eq("crash", getenv("PCMK_panic_action"))) {
141  sysrq_trigger('c');
142  } else {
143  sysrq_trigger('b');
144  }
145  /* reboot(RB_HALT_SYSTEM); rc = errno; */
146  reboot(RB_AUTOBOOT);
147  rc = errno;
148 
149  do_crm_log_always(LOG_EMERG, "Reboot failed, escalating to %d: %s (%d)", ppid, pcmk_strerror(rc), rc);
150 
151  if(ppid > 1) {
152  /* child daemon */
153  exit(pcmk_err_panic);
154  } else {
155  /* pacemakerd or orphan child */
156  exit(DAEMON_RESPAWN_STOP);
157  }
158 }
159 
160 static void
161 pcmk_panic_sbd(void)
162 {
163  union sigval signal_value;
164  pid_t ppid = getppid();
165 
166  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic", sbd_pid);
167 
168  memset(&signal_value, 0, sizeof(signal_value));
169  /* TODO: Arrange for a slightly less brutal option? */
170  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
171  crm_perror(LOG_EMERG, "Cannot signal SBD(%d) to terminate", sbd_pid);
172  pcmk_panic_local();
173  }
174 
175  if(ppid > 1) {
176  /* child daemon */
177  exit(pcmk_err_panic);
178  } else {
179  /* pacemakerd or orphan child */
180  exit(DAEMON_RESPAWN_STOP);
181  }
182 }
183 
184 void
185 pcmk_panic(const char *origin)
186 {
187  static struct qb_log_callsite *panic_cs = NULL;
188 
189  if (panic_cs == NULL) {
190  panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog);
191  }
192 
193  /* Ensure sbd_pid is set */
194  (void)pcmk_locate_sbd();
195 
196  if (panic_cs && panic_cs->targets) {
197  /* getppid() == 1 means our original parent no longer exists */
198  do_crm_log_always(LOG_EMERG,
199  "Shutting down instead of panicking the node: origin=%s, sbd=%d, parent=%d",
200  origin, sbd_pid, getppid());
202  return;
203  }
204 
205  if(sbd_pid > 1) {
206  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic the system: %s", sbd_pid, origin);
207  pcmk_panic_sbd();
208 
209  } else {
210  do_crm_log_always(LOG_EMERG, "Panicking the system directly: %s", origin);
211  pcmk_panic_local();
212  }
213 }
214 
215 pid_t
217 {
218  char *pidfile = NULL;
219  char *sbd_path = NULL;
220 
221  if(sbd_pid > 1) {
222  return sbd_pid;
223  }
224 
225  /* Look for the pid file */
226  pidfile = crm_strdup_printf("%s/sbd.pid", HA_STATE_DIR);
227  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
228 
229  /* Read the pid file */
230  CRM_ASSERT(pidfile);
231 
232  sbd_pid = crm_pidfile_inuse(pidfile, 0, sbd_path);
233  if(sbd_pid > 0) {
234  crm_trace("SBD detected at pid=%d (file)", sbd_pid);
235 
236 #if SUPPORT_PROCFS
237  } else {
238  /* Fall back to /proc for systems that support it */
239  sbd_pid = crm_procfs_pid_of("sbd");
240  crm_trace("SBD detected at pid=%d (proc)", sbd_pid);
241 #endif // SUPPORT_PROCFS
242  }
243 
244  if(sbd_pid < 0) {
245  sbd_pid = 0;
246  crm_trace("SBD not detected");
247  }
248 
249  free(pidfile);
250  free(sbd_path);
251 
252  return sbd_pid;
253 }
254 
/*
 * Get the watchdog timeout sbd was configured with.
 *
 * Returns the SBD_WATCHDOG_TIMEOUT environment variable as parsed by
 * crm_get_msec(). When the variable is unset, NULL is handed to
 * crm_get_msec() and its result for that input is returned unchanged.
 */
long
crm_get_sbd_timeout(void)
{
    const char *env_value = getenv("SBD_WATCHDOG_TIMEOUT");
    long sbd_timeout = crm_get_msec(env_value);

    return sbd_timeout;
}
263 
264 gboolean
265 check_sbd_timeout(const char *value)
266 {
267  long st_timeout = value? crm_get_msec(value) : 0;
268 
269  if (st_timeout <= 0) {
270  crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
271  value? value : "default");
272 
273  } else if (pcmk_locate_sbd() == 0) {
274  do_crm_log_always(LOG_EMERG,
275  "Shutting down: stonith-watchdog-timeout configured (%s) but SBD not active",
276  value);
278  return FALSE;
279 
280  } else {
281  long sbd_timeout = crm_get_sbd_timeout();
282 
283  if (st_timeout < sbd_timeout) {
284  do_crm_log_always(LOG_EMERG,
285  "Shutting down: stonith-watchdog-timeout (%s) too short (must be >%ldms)",
286  value, sbd_timeout);
288  return FALSE;
289  }
290  crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
291  value, sbd_timeout);
292  }
293  return TRUE;
294 }
#define LOG_TRACE
Definition: logging.h:29
long crm_pidfile_inuse(const char *filename, long mypid, const char *daemon)
Definition: utils.c:818
const char * pcmk_strerror(int rc)
Definition: logging.c:1139
void sysrq_init(void)
Definition: watchdog.c:39
pid_t pcmk_locate_sbd(void)
Definition: watchdog.c:216
#define pcmk_ok
Definition: error.h:42
long long crm_get_msec(const char *input)
Definition: utils.c:598
unsigned int crm_trace_nonlog
Definition: logging.c:48
pcmk_panic_flags
Definition: watchdog.c:28
long crm_get_sbd_timeout(void)
Definition: watchdog.c:256
#define crm_debug(fmt, args...)
Definition: logging.h:253
#define crm_trace(fmt, args...)
Definition: logging.h:254
#define HA_STATE_DIR
Definition: config.h:575
#define do_crm_log_always(level, fmt, args...)
Log a message using constant severity.
Definition: logging.h:213
#define pcmk_err_panic
Definition: error.h:57
#define DAEMON_RESPAWN_STOP
Definition: crm.h:55
#define SBIN_DIR
Definition: config.h:685
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:226
gboolean check_sbd_timeout(const char *value)
Definition: watchdog.c:265
#define CRM_ASSERT(expr)
Definition: error.h:35
int crm_exit(int rc)
Definition: utils.c:83
#define SYSRQ
Definition: watchdog.c:36
#define safe_str_eq(a, b)
Definition: util.h:72
int crm_procfs_pid_of(const char *name)
Definition: procfs.c:118
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
void pcmk_panic(const char *origin)
Definition: watchdog.c:185
#define crm_info(fmt, args...)
Definition: logging.h:251