check_cassandra_nodes.pl 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. #!/usr/bin/perl -T
  2. # nagios: -epn
  3. #
  4. # Author: Hari Sekhon
  5. # Date: 2013-10-13 19:32:32 +0100 (Sun, 13 Oct 2013)
  6. #
  7. # https://github.com/harisekhon/nagios-plugins
  8. #
  9. # License: see accompanying LICENSE file
  10. #
  11. # TODO: check if I can rewrite a version of this via API
  # Upper bound on how many down-node addresses are listed in the verbose
  # status message; also interpolated into the description text below.
  my $max_num_down_nodes_to_output = 5;

  # NOTE: $DESCRIPTION and $VERSION are not declared in this file and are
  # assigned before "use strict" takes effect, so they compile as package
  # globals (presumably the ones exported by HariSekhonUtils - confirm there).
  # Keep these assignments above the strictures.
  $DESCRIPTION = "Nagios Plugin to check the number of available cassandra nodes and raise warning/critical on down nodes.
Uses nodetool's status command to determine how many downed nodes there are to compare against the warning/critical thresholds. Reports the addresses of up to $max_num_down_nodes_to_output nodes that are down in verbose mode. Always returns perfdata for graphing the node counts and states.
Can specify a remote host and port otherwise assumes to check via localhost
Tested on Cassandra 1.2.9, 2.0.1, 2.0.9, 2.2.5, 3.0.8, 3.5, 3.6, 3.7";
  $VERSION = "0.4.2";

  use strict;
  use warnings;

  # Add the lib/ directory that sits next to this script to @INC at compile
  # time, so the project modules below can be located.
  BEGIN {
      use File::Basename;
      use lib dirname(__FILE__) . "/lib";
  }
  use HariSekhonUtils qw(:DEFAULT :regex);
  use HariSekhon::Cassandra::Nodetool;

  # Default thresholds on the number of down nodes: warning 0, critical 1.
  set_threshold_defaults(0, 1);

  # Command line options: the nodetool connection options plus the standard
  # warning/critical threshold options.
  %options = (%nodetool_options, %thresholdoptions);

  # Put 'nodetool' at the front of the usage ordering.
  unshift @usage_order, 'nodetool';

  get_options();

  ($nodetool, $host, $port, $user, $password)
      = validate_nodetool_options($nodetool, $host, $port, $user, $password);
  validate_thresholds(undef, undef, { simple => "upper", integer => 1, positive => 1 });

  vlog2;
  set_timeout();

  $status = "OK";

  # Build and run "<nodetool> <connection options>status", collecting its
  # output lines.
  my $options = nodetool_options($host, $port, $user, $password);
  my $cmd     = "$nodetool ${options}status";

  vlog2 "fetching cluster nodes information";
  my @output = cmd($cmd);

  # Tallies filled in by parse_state(): up/down come from the first status
  # letter, normal/leaving/joining/moving from the second.
  my ($up_nodes, $down_nodes, $normal_nodes, $leaving_nodes, $joining_nodes, $moving_nodes) = (0) x 6;

  # Addresses of the nodes seen as down, and the address of the node line
  # parsed most recently.
  my @down_nodes;
  my $node_address;
  # parse_state($line)
  #
  # Classifies a single line of "nodetool status" output and updates the
  # file-level tallies accordingly.
  #
  # A node line starts with two letters followed by whitespace and an address
  # matching $host_regex:
  #   1st letter  U / D          -> $up_nodes or $down_nodes (a down node's
  #                                 address is also pushed onto @down_nodes)
  #   2nd letter  N / L / J / M  -> $normal_nodes, $leaving_nodes,
  #                                 $joining_nodes or $moving_nodes
  # The matched address is also stored in the file-level $node_address.
  #
  # Lines matching $nodetool_status_header_regex, or accepted by
  # skip_nodetool_output(), are ignored. Anything else is handed to
  # die_nodetool_unrecognized_output().
  #
  # Returns 1 for a node line and 0 for an ignored line.
  #
  # The line is taken from the argument rather than from the global $_, so the
  # sub no longer depends on being called from a loop that aliases $_. The ($)
  # prototype is kept so existing call sites parse exactly as before.
  sub parse_state ($) {
      my ($line) = @_;

      # Don't know what remote JMX auth failure looks like yet so will go critical on any user/password related message returned assuming that's an auth failure
      check_nodetool_errors($line);

      if($line =~ /^[UD][NLJM]\s+($host_regex)/){
          $node_address = $1;

          # The match above guarantees the first two characters are the
          # status letter and the state letter.
          my $status_letter = substr($line, 0, 1);
          my $state_letter  = substr($line, 1, 1);

          if($status_letter eq 'U'){
              $up_nodes++;
          } elsif($status_letter eq 'D'){
              $down_nodes++;
              push(@down_nodes, $node_address);
          }

          if($state_letter eq 'N'){
              $normal_nodes++;
          } elsif($state_letter eq 'L'){
              $leaving_nodes++;
          } elsif($state_letter eq 'J'){
              $joining_nodes++;
          } elsif($state_letter eq 'M'){
              $moving_nodes++;
          } else {
              # Defensive only: the regex above already restricts this letter.
              quit "UNKNOWN", "unrecognized second column for node status, $nagios_plugins_support_msg";
          }
          return 1;
      } elsif($line =~ $nodetool_status_header_regex){
          # ignore
      } elsif(skip_nodetool_output($line)){
          # ignore
      } else {
          die_nodetool_unrecognized_output($line);
      }
      return 0;
  }
  # Run every line of the nodetool output through parse_state(), which
  # accumulates the node tallies and the list of down-node addresses.
  foreach (@output) {
      parse_state($_);
  }

  vlog2 "checking node counts and number of nodes down";
  if (@down_nodes) {
      # Each recorded down address must have been counted exactly once; any
      # mismatch means the parsing went wrong.
      quit "UNKNOWN", "inconsistent nodes down count vs nodes down addresses, probably a parsing error in parse_state(). $nagios_plugins_support_msg" unless $down_nodes == scalar @down_nodes;
      plural $down_nodes;
      vlog2("$down_nodes node$plural down: " . join(", ", @down_nodes));
  }

  # Every node line contributes one status letter (up/down) and one state
  # letter (normal/leaving/joining/moving), so the two totals must agree.
  unless (($up_nodes + $down_nodes) == ($normal_nodes + $leaving_nodes + $joining_nodes + $moving_nodes)) {
      quit "UNKNOWN", "live+down node counts vs (normal/leaving/joining/moving) nodes are not equal, investigation required";
  }

  $msg = "$up_nodes nodes up, $down_nodes down";
  check_thresholds($down_nodes);

  # In verbose mode list the down-node addresses, capped at
  # $max_num_down_nodes_to_output entries.
  if ($verbose and @down_nodes) {
      plural scalar @down_nodes;
      $msg .= " [node$plural down: ";
      if (scalar @down_nodes > $max_num_down_nodes_to_output) {
          # Take the first N addresses with a slice. The previous index loop
          # started from an uninitialised counter and prefixed every address
          # with ", ", which left a stray leading comma in the message.
          $msg .= join(", ", @down_nodes[0 .. $max_num_down_nodes_to_output - 1]);
          $msg .= " ... ";
      } else {
          $msg .= join(", ", @down_nodes);
      }
      $msg .= "]";
  }

  $msg .= ", node states: $normal_nodes normal, $leaving_nodes leaving, $joining_nodes joining, $moving_nodes moving";

  # Perfdata follows the "|" separator. msg_perf_thresholds() is called right
  # after nodes_down, presumably to attach the thresholds to that metric -
  # confirm in HariSekhonUtils.
  $msg .= " | nodes_up=$up_nodes nodes_down=$down_nodes";
  msg_perf_thresholds();
  $msg .= " normal_nodes=$normal_nodes leaving_nodes=$leaving_nodes joining_nodes=$joining_nodes moving_nodes=$moving_nodes";

  vlog2;
  quit $status, $msg;