/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.commons.lang.math.RandomUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;

/**
 * Restarts a ratio of the running region servers in a rolling fashion. At each step it either
 * kills a server or starts one back up, sleeping a random amount (up to sleepTime) between steps.
 * maxDeadServers limits how many servers may be down at the same time during the restart.
 */
public class RollingBatchRestartRsAction extends BatchRestartRsAction {
  private static final Log LOG = LogFactory.getLog(RollingBatchRestartRsAction.class);
  protected int maxDeadServers;

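  /** Rolling-restarts the selected ratio of servers with the default limit of 5 dead at a time. */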
  public RollingBatchRestartRsAction(long sleepTime, float ratio) {
    this(sleepTime, ratio, 5);
  }

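  /** Rolling-restarts the selected ratio of servers, allowing at most maxDeadServers dead at a time. */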
  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
    super(sleepTime, ratio);
    this.maxDeadServers = maxDeadServers;
  }

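  /** The two kinds of step the rolling restart alternates between. */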
  enum KillOrStart {
    KILL,
    START
  }

  @Override
  public void perform() throws Exception {
    LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
        (int)(ratio * 100)));
    List<ServerName> selectedServers = selectServers();

    Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
    Queue<ServerName> deadServers = new LinkedList<ServerName>();

    // Loop until every selected server has been killed and brought back up, unless the monkey
    // is asked to stop.
    while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
      KillOrStart action = KillOrStart.KILL;

      if (serversToBeKilled.isEmpty()) { // no more servers left to kill
        action = KillOrStart.START;
      } else if (deadServers.isEmpty()) { // no dead servers to start yet
        action = KillOrStart.KILL;
      } else if (deadServers.size() >= maxDeadServers) {
        // too many servers are already dead; don't kill any more
        action = KillOrStart.START;
      } else {
        // otherwise flip a coin
        action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
      }

      ServerName server;

      switch (action) {
      case KILL:
        server = serversToBeKilled.remove();
        try {
          killRs(server);
        } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
          // The kill command can time out even when the kill itself went through, so presume the
          // server is dead either way; the START step will bring it back later.
          LOG.info("Problem killing but presume successful; code=" + e.getExitCode(), e);
        }
        deadServers.add(server);
        break;
      case START:
        try {
          server = deadServers.remove();
          startRs(server);
        } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
          // The start may fail; log it and keep going rather than abort the whole action, even
          // though the server is not re-queued and may stay down.
          LOG.info("Problem starting, will retry; code=" + e.getExitCode(), e);
        }
        break;
      }

      sleep(RandomUtils.nextInt((int)sleepTime));
    }
  }

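  /** Selects the servers to restart: a random subset of the live servers, sized by the configured ratio. */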
  protected List<ServerName> selectServers() throws IOException {
    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
  }

  /**
   * Small standalone test of the action: fakes a four-server cluster and makes every third
   * kill/start attempt fail, to exercise the error handling in {@link #perform()}.
   */
  public static void main(final String[] args) throws Exception {
    RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
      private int invocations = 0;
      @Override
      protected ServerName[] getCurrentServers() throws IOException {
        final int count = 4;
        List<ServerName> serverNames = new ArrayList<ServerName>(count);
        for (int i = 0; i < count; i++) {
          serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
        }
        return serverNames.toArray(new ServerName[serverNames.size()]);
      }

      @Override
      protected void killRs(ServerName server) throws IOException {
        LOG.info("Killed " + server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }

      @Override
      protected void startRs(ServerName server) throws IOException {
        LOG.info("Started " + server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }
    };

    action.perform();
  }
}