View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.chaos.actions;
20  
21  import java.util.ArrayList;
22  import java.util.HashSet;
23  import java.util.LinkedList;
24  import java.util.List;
25  import java.util.Set;
26  
27  import org.apache.commons.lang.math.RandomUtils;
28  import org.apache.hadoop.hbase.ClusterStatus;
29  import org.apache.hadoop.hbase.ServerName;
30  import org.junit.Assert;
31  
32  /** This action is too specific to put in ChaosMonkey; put it here */
33  public class UnbalanceKillAndRebalanceAction extends Action {
34    /** Fractions of servers to get regions and live and die respectively; from all other
35     * servers, HOARD_FRC_OF_REGIONS will be removed to the above randomly */
36    private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
37    private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
38    private static final double HOARD_FRC_OF_REGIONS = 0.8;
39    /** Waits between calling unbalance and killing servers, kills and rebalance, and rebalance
40     * and restarting the servers; to make sure these events have time to impact the cluster. */
41    private long waitForUnbalanceMilliSec;
42    private long waitForKillsMilliSec;
43    private long waitAfterBalanceMilliSec;
44  
45    public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance) {
46      super();
47      waitForUnbalanceMilliSec = waitUnbalance;
48      waitForKillsMilliSec = waitKill;
49      waitAfterBalanceMilliSec = waitAfterBalance;
50    }
51  
52    @Override
53    public void perform() throws Exception {
54      ClusterStatus status = this.cluster.getClusterStatus();
55      List<ServerName> victimServers = new LinkedList<ServerName>(status.getServers());
56      Set<ServerName> killedServers = new HashSet<ServerName>();
57  
58      int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
59      int deadCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
60      Assert.assertTrue((liveCount + deadCount) < victimServers.size());
61      List<ServerName> targetServers = new ArrayList<ServerName>(liveCount);
62      for (int i = 0; i < liveCount + deadCount; ++i) {
63        int victimIx = RandomUtils.nextInt(victimServers.size());
64        targetServers.add(victimServers.remove(victimIx));
65      }
66      unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
67      Thread.sleep(waitForUnbalanceMilliSec);
68      for (int i = 0; i < liveCount; ++i) {
69        // Don't keep killing servers if we're
70        // trying to stop the monkey.
71        if (context.isStopping()) {
72          break;
73        }
74        killRs(targetServers.get(i));
75        killedServers.add(targetServers.get(i));
76      }
77  
78      Thread.sleep(waitForKillsMilliSec);
79      forceBalancer();
80      Thread.sleep(waitAfterBalanceMilliSec);
81      for (ServerName server:killedServers) {
82        startRs(server);
83      }
84    }
85  }