/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
 * Test transitions of state across the master.  Sets up the cluster once and
 * then runs a couple of tests.
 */
@Category(LargeTests.class)
public class TestMasterTransitions {
  private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private static final TableName TABLENAME = TableName.valueOf("master_transitions");
  private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
    Bytes.toBytes("b"), Bytes.toBytes("c")};

  /**
   * Start up a mini cluster and put a small table of many empty regions into it.
   * @throws Exception
   */
  @BeforeClass public static void beforeAllTests() throws Exception {
    TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
    TEST_UTIL.startMiniCluster(2);
    // Create a table of three families.  This will assign a region.
    TEST_UTIL.createMultiRegionTable(TABLENAME, FAMILIES);
    // try-with-resources so the table is closed even if region counting or
    // row loading below throws.
    try (HTable t = (HTable) TEST_UTIL.getConnection().getTable(TABLENAME)) {
      int countOfRegions;
      try (RegionLocator r = t.getRegionLocator()) {
        countOfRegions = r.getStartKeys().length;
      }
      TEST_UTIL.waitUntilAllRegionsAssigned(TABLENAME);
      addToEachStartKey(countOfRegions);
    }
  }

  @AfterClass public static void afterAllTests() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  @Before public void setup() throws IOException {
    // Tests below kill regionservers; make sure we are back to two before each.
    TEST_UTIL.ensureSomeRegionServersAvailable(2);
  }

  /**
   * Listener for regionserver events testing hbase-2428 (Infinite loop of
   * region closes if hbase:meta region is offline).  In particular, listen
   * for the close of the 'metaServer' and when it comes in, requeue it with a
   * delay as though there were an issue processing the shutdown.  As part of
   * the requeuing, send over a close of a region on 'otherServer' so it comes
   * into a master that has its meta region marked as offline.
   */
  /*
  static class HBase2428Listener implements RegionServerOperationListener {
    // Map of what we've delayed so we don't do repeated delays.
    private final Set<RegionServerOperation> postponed =
      new CopyOnWriteArraySet<RegionServerOperation>();
    private boolean done = false;
    private boolean metaShutdownReceived = false;
    private final HServerAddress metaAddress;
    private final MiniHBaseCluster cluster;
    private final int otherServerIndex;
    private final HRegionInfo hri;
    private int closeCount = 0;
    static final int SERVER_DURATION = 3 * 1000;
    static final int CLOSE_DURATION = 1 * 1000;

    HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
        final HRegionInfo closingHRI, final int otherServerIndex) {
      this.cluster = c;
      this.metaAddress = metaAddress;
      this.hri = closingHRI;
      this.otherServerIndex = otherServerIndex;
    }

    @Override
    public boolean process(final RegionServerOperation op) throws IOException {
      // If a regionserver shutdown and its of the meta server, then we want to
      // delay the processing of the shutdown and send off a close of a region on
      // the 'otherServer.
      boolean result = true;
      if (op instanceof ProcessServerShutdown) {
        ProcessServerShutdown pss = (ProcessServerShutdown)op;
        if (pss.getDeadServerAddress().equals(this.metaAddress)) {
          // Don't postpone more than once.
          if (!this.postponed.contains(pss)) {
            // Close some region.
            this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
              new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
                Bytes.toBytes("Forcing close in test")));
            this.postponed.add(pss);
            // Put off the processing of the regionserver shutdown processing.
            pss.setDelay(SERVER_DURATION);
            this.metaShutdownReceived = true;
            // Return false.  This will add this op to the delayed queue.
            result = false;
          }
        }
      } else {
        // Have the close run frequently.
        if (isWantedCloseOperation(op) != null) {
          op.setDelay(CLOSE_DURATION);
          // Count how many times it comes through here.
          this.closeCount++;
        }
      }
      return result;
    }

    public void processed(final RegionServerOperation op) {
      if (isWantedCloseOperation(op) != null) return;
      this.done = true;
    }
  */
  /*
   * @param op
   * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
   * cast as a ProcessRegionClose.
   */
  /*
    private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
      // Count every time we get a close operation.
      if (op instanceof ProcessRegionClose) {
        ProcessRegionClose c = (ProcessRegionClose)op;
        if (c.regionInfo.equals(hri)) {
          return c;
        }
      }
      return null;
    }

    boolean isDone() {
      return this.done;
    }

    boolean isMetaShutdownReceived() {
      return metaShutdownReceived;
    }

    int getCloseCount() {
      return this.closeCount;
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      return true;
    }
  }
  */
  /**
   * In 2428, the meta region has just been set offline and then a close comes
   * in.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
   */
  @Ignore @Test (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
  throws Exception {
    /*
    LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    final HMaster master = cluster.getMaster();
    int metaIndex = cluster.getServerWithMeta();
    // Figure the index of the server that is not serving the hbase:meta
    int otherServerIndex = -1;
    for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
      if (i == metaIndex) continue;
      otherServerIndex = i;
      break;
    }
    final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
    final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);

    // Get a region out on the otherServer.
    final HRegionInfo hri =
      otherServer.getOnlineRegions().iterator().next().getRegionInfo();

    // Add our RegionServerOperationsListener
    HBase2428Listener listener = new HBase2428Listener(cluster,
      metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
    master.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Now close the server carrying meta.
      cluster.abortRegionServer(metaIndex);

      // First wait on receipt of meta server shutdown message.
      while(!listener.metaShutdownReceived) Threads.sleep(100);
      while(!listener.isDone()) Threads.sleep(10);
      // We should not have retried the close more times than it took for the
      // server shutdown message to exit the delay queue and get processed
      // (Multiple by two to add in some slop in case of GC or something).
      assertTrue(listener.getCloseCount() > 1);
      assertTrue(listener.getCloseCount() <
        ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));

      // Assert the closed region came back online
      assertRegionIsBackOnline(hri);
    } finally {
      master.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /**
   * Test adding in a new server before old one on same host+port is dead.
   * Make the test more onerous by having the server under test carry the meta.
   * If confusion between old and new, purportedly meta never comes back.  Test
   * that meta gets redeployed.
   */
  @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
  throws IOException {
    /*
    LOG.info("Running testAddingServerBeforeOldIsDead2413");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    int count = count();
    int metaIndex = cluster.getServerWithMeta();
    MiniHBaseClusterRegionServer metaHRS =
      (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
    int port = metaHRS.getServerInfo().getServerAddress().getPort();
    Configuration c = TEST_UTIL.getConfiguration();
    String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
    try {
      LOG.info("KILLED=" + metaHRS);
      metaHRS.kill();
      c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
      // Try and start new regionserver.  It might clash with the old
      // regionserver port so keep trying to get past the BindException.
      HRegionServer hrs = null;
      while (true) {
        try {
          hrs = cluster.startRegionServer().getRegionServer();
          break;
        } catch (IOException e) {
          if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
            InvocationTargetException ee = (InvocationTargetException)e.getCause();
            if (ee.getCause() != null && ee.getCause() instanceof BindException) {
              LOG.info("BindException; retrying: " + e.toString());
            }
          }
        }
      }
      LOG.info("STARTED=" + hrs);
      // Wait until he's been given at least 3 regions before we go on to try
      // and count rows in table.
      while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
      LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
        " regions");
      assertEquals(count, count());
    } finally {
      c.set(HConstants.REGIONSERVER_PORT, oldPort);
    }
    */
  }

  /**
   * HBase2482 is about outstanding region openings.  If any are outstanding
   * when a regionserver goes down, then they'll never deploy.  They'll be
   * stuck in the regions-in-transition list for ever.  This listener looks
   * for a region opening HMsg and if its from the server passed on construction,
   * then we kill it.  It also looks out for a close message on the victim
   * server because that signifies start of the fireworks.
   */
  /*
  static class HBase2482Listener implements RegionServerOperationListener {
    private final HRegionServer victim;
    private boolean abortSent = false;
    // We closed regions on new server.
    private volatile boolean closed = false;
    // Copy of regions on new server
    private final Collection<HRegion> copyOfOnlineRegions;
    // This is the region that was in transition on the server we aborted. Test
    // passes if this region comes back online successfully.
    private HRegionInfo regionToFind;

    HBase2482Listener(final HRegionServer victim) {
      this.victim = victim;
      // Copy regions currently open on this server so I can notice when
      // there is a close.
      this.copyOfOnlineRegions =
        this.victim.getCopyOfOnlineRegionsSortedBySize().values();
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      if (!victim.getServerInfo().equals(serverInfo) ||
          this.abortSent || !this.closed) {
        return true;
      }
      if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
      // Save the region that is in transition so can test later it came back.
      this.regionToFind = incomingMsg.getRegionInfo();
      String msg = "ABORTING " + this.victim + " because got a " +
        HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
        incomingMsg.getRegionInfo().getRegionNameAsString();
      this.victim.abort(msg);
      this.abortSent = true;
      return true;
    }

    @Override
    public boolean process(RegionServerOperation op) throws IOException {
      return true;
    }

    @Override
    public void processed(RegionServerOperation op) {
      if (this.closed || !(op instanceof ProcessRegionClose)) return;
      ProcessRegionClose close = (ProcessRegionClose)op;
      for (HRegion r: this.copyOfOnlineRegions) {
        if (r.getRegionInfo().equals(close.regionInfo)) {
          // We've closed one of the regions that was on the victim server.
          // Now can start testing for when all regions are back online again
          LOG.info("Found close of " +
            r.getRegionInfo().getRegionNameAsString() +
            "; setting close happened flag");
          this.closed = true;
          break;
        }
      }
    }
  }
  */
  /**
   * In 2482, a RS with an opening region on it dies.  The said region is then
   * stuck in the master's regions-in-transition and never leaves it.  This
   * test works by bringing up a new regionserver, waiting for the load
   * balancer to give it some regions.  Then, we close all on the new server.
   * After sending all the close messages, we send the new regionserver the
   * special blocking message so it can not process any more messages.
   * Meantime reopening of the just-closed regions is backed up on the new
   * server.  Soon as master gets an opening region from the new regionserver,
   * we kill it.  We then wait on all regions to come back on line.  If bug
   * is fixed, this should happen soon as the processing of the killed server is
   * done.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
   */
  @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
  throws Exception {
    /*
    LOG.info("Running testKillRSWithOpeningRegion2482");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    if (cluster.getLiveRegionServerThreads().size() < 2) {
      // Need at least two servers.
      cluster.startRegionServer();
    }
    // Count how many regions are online.  They need to be all back online for
    // this test to succeed.
    int countOfMetaRegions = countOfMetaRegions();
    // Add a listener on the server.
    HMaster m = cluster.getMaster();
    // Start new regionserver.
    MiniHBaseClusterRegionServer hrs =
      (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
    LOG.info("Started new regionserver: " + hrs.toString());
    // Wait until has some regions before proceeding.  Balancer will give it some.
    int minimumRegions =
      countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
    while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
    // Set the listener only after some regions have been opened on new server.
    HBase2482Listener listener = new HBase2482Listener(hrs);
    m.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Go close all non-catalog regions on this new server
      closeAllNonCatalogRegions(cluster, hrs);
      // After all closes, add blocking message before the region opens start to
      // come in.
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
      // Wait till one of the above close messages has an effect before we start
      // wait on all regions back online.
      while (!listener.closed) Threads.sleep(100);
      LOG.info("Past close");
      // Make sure the abort server message was sent.
      while(!listener.abortSent) Threads.sleep(100);
      LOG.info("Past abort send; waiting on all regions to redeploy");
      // Now wait for regions to come back online.
      assertRegionIsBackOnline(listener.regionToFind);
    } finally {
      m.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /*
   * @return Count of all non-catalog regions on the designated server
   */
  /*
  private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
    final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
  throws IOException {
    int countOfRegions = 0;
    for (HRegion r: hrs.getOnlineRegions()) {
      if (r.getRegionInfo().isMetaRegion()) continue;
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
      LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
        " on " + hrs.toString());
      countOfRegions++;
    }
    return countOfRegions;
  }

  private void assertRegionIsBackOnline(final HRegionInfo hri)
  throws IOException {
    // Region should have an entry in its startkey because of addRowToEachRegion.
    byte [] row = getStartKey(hri);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    Get g =  new Get(row);
    assertTrue((t.get(g)).size() > 0);
  }

  /*
   * @return Count of regions in meta table.
   * @throws IOException
   */
  /*
  private static int countOfMetaRegions()
  throws IOException {
    HTable meta = new HTable(TEST_UTIL.getConfiguration(),
      HConstants.META_TABLE_NAME);
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      byte [] b =
        r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
      if (b == null || b.length <= 0) break;
      rows++;
    }
    s.close();
    return rows;
  }
  */
  /*
   * Add to each of the regions in hbase:meta a value.  Key is the startrow of the
   * region (except its 'aaa' for first region).  Actual value is the row name.
   * @param expected Number of regions whose start keys we expect to load.
   * @return Count of rows written (asserted equal to <code>expected</code>).
   * @throws IOException
   */
  private static int addToEachStartKey(final int expected) throws IOException {
    // try-with-resources so tables and scanner are closed even when the
    // row-count assertion at the bottom fails.
    try (Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
        Table meta = TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME)) {
      int rows = 0;
      Scan scan = new Scan();
      scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
      try (ResultScanner s = meta.getScanner(scan)) {
        for (Result r = null; (r = s.next()) != null;) {
          HRegionInfo hri = HRegionInfo.getHRegionInfo(r);
          if (hri == null) break;
          // Skip meta rows for other tables (e.g. hbase:namespace).
          if (!hri.getTable().equals(TABLENAME)) {
            continue;
          }
          // Row key is the region start key ('aaa' for the first, empty-start
          // region); the value written is the row key itself.
          byte [] row = getStartKey(hri);
          Put p = new Put(row);
          p.setDurability(Durability.SKIP_WAL);
          p.add(getTestFamily(), getTestQualifier(), row);
          t.put(p);
          rows++;
        }
      }
      Assert.assertEquals(expected, rows);
      return rows;
    }
  }

  /*
   * @param hri
   * @return Start key for hri (if start key is '', then return 'aaa').
   */
  private static byte [] getStartKey(final HRegionInfo hri) {
    return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
        Bytes.toBytes("aaa"): hri.getStartKey();
  }

  // Family used for the test rows written by addToEachStartKey.
  private static byte [] getTestFamily() {
    return FAMILIES[0];
  }

  // Qualifier used for the test rows; reuses the family name bytes.
  private static byte [] getTestQualifier() {
    return getTestFamily();
  }
}