1 package org.apache.jcs.auxiliary.remote;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
19 * under the License.
20 */
21
22 import java.io.IOException;
23
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26 import org.apache.jcs.engine.CacheConstants;
27 import org.apache.jcs.engine.behavior.ICache;
28 import org.apache.jcs.engine.behavior.ICompositeCacheManager;
29 import org.apache.jcs.engine.behavior.IElementSerializer;
30 import org.apache.jcs.engine.logging.behavior.ICacheEventLogger;
31
32 /**
33 * The RemoteCacheFailoverRunner tries to establish a connection with a failover
34 * server, if any are defined. Once a failover connection is made, it will
35 * attempt to replace the failover with the primary remote server.
36 * <p>
37 * It works by switching out the RemoteCacheNoWait inside the Facade.
38 * <p>
39 * Client (i.e.) the CompositeCache has reference to a RemoteCacheNoWaitFacade.
40 * This facade is created by the RemoteCacheFactory. The factory maintains a set
41 * of managers, one for each remote server. Typically, there will only be one
42 * manager.
43 * <p>
44 * If you use multiple remote servers, you may want to set one or more as
45 * failovers. If a local cache cannot connect to the primary server, or looses
46 * its connection to the primary server, it will attempt to restore that
47 * Connection in the background. If failovers are defined, the Failover runner
48 * will try to connect to a failover until the primary is restored.
49 *
50 */
51 public class RemoteCacheFailoverRunner
52 implements Runnable
53 {
54 /** The logger */
55 private final static Log log = LogFactory.getLog( RemoteCacheFailoverRunner.class );
56
57 /** The facade returned to the composite cache. */
58 private final RemoteCacheNoWaitFacade facade;
59
60 /** How long to wait between reconnect attempts. */
61 private static long idlePeriod = 20 * 1000;
62
63 /** Have we reconnected. */
64 private boolean alright = true;
65
66 /** The cache manager */
67 private final ICompositeCacheManager cacheMgr;
68
69 /** The event logger. */
70 private final ICacheEventLogger cacheEventLogger;
71
72 /** The serializer. */
73 private final IElementSerializer elementSerializer;
74
75 /**
76 * Constructor for the RemoteCacheFailoverRunner object. This allows the
77 * FailoverRunner to modify the facade that the CompositeCache references.
78 *
79 * @param facade
80 * the facade the CompositeCache talks to.
81 * @param cacheMgr
82 * @param cacheEventLogger
83 * @param elementSerializer
84 */
85 public RemoteCacheFailoverRunner( RemoteCacheNoWaitFacade facade, ICompositeCacheManager cacheMgr,
86 ICacheEventLogger cacheEventLogger, IElementSerializer elementSerializer )
87 {
88 this.facade = facade;
89 this.cacheMgr = cacheMgr;
90 this.cacheEventLogger = cacheEventLogger;
91 this.elementSerializer = elementSerializer;
92 }
93
94 /**
95 * Notifies the cache monitor that an error occurred, and kicks off the
96 * error recovery process.
97 */
98 public void notifyError()
99 {
100 bad();
101 synchronized ( this )
102 {
103 notify();
104 }
105 }
106
107 /**
108 * Main processing method for the RemoteCacheFailoverRunner object.
109 * <p>
110 * If we do not have a connection with any failover server, this will try to
111 * connect one at a time. If no connection can be made, it goes to sleep for
112 * a while (20 seconds).
113 * <p>
114 * Once a connection with a failover is made, we will try to reconnect to
115 * the primary server.
116 * <p>
117 * The primary server is the first server defines in the FailoverServers
118 * list.
119 */
120 public void run()
121 {
122 // start the main work of connecting to a failover and then restoring
123 // the primary.
124 connectAndRestore();
125
126 if ( log.isInfoEnabled() )
127 {
128 log.info( "Exiting failover runner. Failover index = " + facade.remoteCacheAttributes.getFailoverIndex() );
129 if ( facade.remoteCacheAttributes.getFailoverIndex() <= 0 )
130 {
131 log.info( "Failover index is <= 0, meaning we are not " + "connected to a failover server." );
132 }
133 else if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
134 {
135 log.info( "Failover index is > 0, meaning we are " + "connected to a failover server." );
136 }
137 // log if we are alright or not.
138 }
139 }
140
141 /**
142 * This is the main loop. If there are failovers defined, then this will
143 * continue until the primary is re-connected. If no failovers are defined,
144 * this will exit automatically.
145 */
146 private void connectAndRestore()
147 {
148 do
149 {
150 log.info( "Remote cache FAILOVER RUNNING." );
151
152 // there is no active listener
153 if ( !alright )
154 {
155 // Monitor each RemoteCacheManager instance one after the other.
156 // Each RemoteCacheManager corresponds to one remote connection.
157 String[] failovers = facade.remoteCacheAttributes.getFailovers();
158 // we should probably check to see if there are any failovers,
159 // even though the caller
160 // should have already.
161
162 if ( failovers == null )
163 {
164 log.warn( "Remote is misconfigured, failovers was null." );
165 return;
166 }
167 else if ( failovers.length == 1 )
168 {
169 // if there is only the primary, return out of this
170 if ( log.isInfoEnabled() )
171 {
172 log.info( "No failovers defined, exiting failover runner." );
173 return;
174 }
175 }
176
177 int fidx = facade.remoteCacheAttributes.getFailoverIndex();
178 log.debug( "fidx = " + fidx + " failovers.length = " + failovers.length );
179
180 // shouldn't we see if the primary is backup?
181 // If we don't check the primary, if it gets connected in the
182 // background,
183 // we will disconnect it only to put it right back
184 int i = fidx; // + 1; // +1 skips the primary
185 if ( log.isDebugEnabled() )
186 {
187 log.debug( "stating at failover i = " + i );
188 }
189
190 // try them one at a time until successful
191 for ( ; i < failovers.length && !alright; i++ )
192 {
193 String server = failovers[i];
194 if ( log.isDebugEnabled() )
195 {
196 log.debug( "Trying server [" + server + "] at failover index i = " + i );
197 }
198
199 RemoteCacheAttributes rca = null;
200 try
201 {
202 rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
203 rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
204 rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
205 RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr, cacheEventLogger, elementSerializer );
206
207 if ( log.isDebugEnabled() )
208 {
209 log.debug( "RemoteCacheAttributes for failover = " + rca.toString() );
210 }
211
212 // add a listener if there are none, need to tell rca
213 // what number it is at
214 ICache ic = rcm.getCache( rca.getCacheName() );
215 if ( ic != null )
216 {
217 if ( ic.getStatus() == CacheConstants.STATUS_ALIVE )
218 {
219 // may need to do this more gracefully
220 log.debug( "reseting no wait" );
221 facade.noWaits = new RemoteCacheNoWait[1];
222 facade.noWaits[0] = (RemoteCacheNoWait) ic;
223 facade.remoteCacheAttributes.setFailoverIndex( i );
224
225 synchronized ( this )
226 {
227 if ( log.isDebugEnabled() )
228 {
229 log.debug( "setting ALRIGHT to true" );
230 if ( i > 0 )
231 {
232 log.debug( "Moving to Primary Recovery Mode, failover index = " + i );
233 }
234 else
235 {
236 if ( log.isInfoEnabled() )
237 {
238 String message = "No need to connect to failover, the primary server is back up.";
239 log.info( message );
240 }
241 }
242 }
243
244 alright = true;
245
246 if ( log.isInfoEnabled() )
247 {
248 log.info( "CONNECTED to host = [" + rca.getRemoteHost() + "] port = ["
249 + rca.getRemotePort() + "]" );
250 }
251 }
252 }
253 }
254 else
255 {
256 log.info( "noWait is null" );
257 }
258 }
259 catch ( Exception ex )
260 {
261 bad();
262 // Problem encountered in fixing the caches managed by a
263 // RemoteCacheManager instance.
264 // Soldier on to the next RemoteCacheManager instance.
265 if ( i == 0 )
266 {
267 log.warn( "FAILED to connect, as expected, to primary" + rca.getRemoteHost() + ":"
268 + rca.getRemotePort(), ex );
269 }
270 else
271 {
272 log.error( "FAILED to connect to failover [" + rca.getRemoteHost() + ":"
273 + rca.getRemotePort() + "]", ex );
274 }
275 }
276 }
277 }
278 // end if !alright
279 // get here if while index >0 and alright, meaning that we are
280 // connected to some backup server.
281 else
282 {
283 if ( log.isDebugEnabled() )
284 {
285 log.debug( "ALRIGHT is true " );
286 }
287 if ( log.isInfoEnabled() )
288 {
289 log.info( "Failover runner is in primary recovery mode. Failover index = "
290 + facade.remoteCacheAttributes.getFailoverIndex() + "\n" + "Will now try to reconnect to primary server." );
291 }
292 }
293
294 boolean primaryRestoredSuccessfully = false;
295 // if we are not connected to the primary, try.
296 if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
297 {
298 primaryRestoredSuccessfully = restorePrimary();
299 if ( log.isDebugEnabled() )
300 {
301 log.debug( "Primary recovery success state = " + primaryRestoredSuccessfully );
302 }
303 }
304
305 if ( !primaryRestoredSuccessfully )
306 {
307 // Time driven mode: sleep between each round of recovery
308 // attempt.
309 try
310 {
311 log.warn( "Failed to reconnect to primary server. Cache failover runner is going to sleep for "
312 + idlePeriod + " milliseconds." );
313 Thread.sleep( idlePeriod );
314 }
315 catch ( InterruptedException ex )
316 {
317 // ignore;
318 }
319 }
320
321 // try to bring the listener back to the primary
322 }
323 while ( facade.remoteCacheAttributes.getFailoverIndex() > 0 || !alright );
324 // continue if the primary is not restored or if things are not alright.
325
326 }
327
328 /**
329 * Try to restore the primary server.
330 * <p>
331 * Once primary is restored the failover listener must be deregistered.
332 * <p>
333 * The primary server is the first server defines in the FailoverServers
334 * list.
335 *
336 * @return boolean value indicating whether the resoration was successful
337 */
338 private boolean restorePrimary()
339 {
340 // try to move back to the primary
341 String[] failovers = facade.remoteCacheAttributes.getFailovers();
342 String server = failovers[0];
343
344 if ( log.isInfoEnabled() )
345 {
346 log.info( "Trying to restore connection to primary remote server [" + server + "]" );
347 }
348
349 try
350 {
351 RemoteCacheAttributes rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
352 rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
353 rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
354 RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr, cacheEventLogger, elementSerializer );
355
356 // add a listener if there are none, need to tell rca what number it
357 // is at
358 ICache ic = rcm.getCache( rca.getCacheName() );
359 // by default the listener id should be 0, else it will be the
360 // listener
361 // Originally associated with the remote cache. either way is fine.
362 // We just don't want the listener id from a failover being used.
363 // If the remote server was rebooted this could be a problem if new
364 // locals were also added.
365
366 if ( ic != null )
367 {
368 if ( ic.getStatus() == CacheConstants.STATUS_ALIVE )
369 {
370 try
371 {
372 // we could have more than one listener registered right
373 // now.
374 // this will not result in a loop, only duplication
375 // stop duplicate listening.
376 if ( facade.noWaits[0] != null && facade.noWaits[0].getStatus() == CacheConstants.STATUS_ALIVE )
377 {
378 int fidx = facade.remoteCacheAttributes.getFailoverIndex();
379
380 if ( fidx > 0 )
381 {
382 String serverOld = failovers[fidx];
383
384 if ( log.isDebugEnabled() )
385 {
386 log.debug( "Failover Index = " + fidx + " the server at that index is ["
387 + serverOld + "]" );
388 }
389
390 if ( serverOld != null )
391 {
392 // create attributes that reflect the
393 // previous failed over configuration.
394 RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
395 rcaOld.setRemoteHost( serverOld.substring( 0, serverOld.indexOf( ":" ) ) );
396 rcaOld.setRemotePort( Integer.parseInt( serverOld.substring( serverOld
397 .indexOf( ":" ) + 1 ) ) );
398 RemoteCacheManager rcmOld = RemoteCacheManager.getInstance( rcaOld, cacheMgr, cacheEventLogger, elementSerializer );
399
400 if ( rcmOld != null )
401 {
402 // manager can remove by name if
403 // necessary
404 rcmOld.removeRemoteCacheListener( rcaOld );
405 }
406 if ( log.isInfoEnabled() )
407 {
408 log.info( "Successfully deregistered from FAILOVER remote server = "
409 + serverOld );
410 }
411 }
412 }
413 else if ( fidx == 0 )
414 {
415 // this should never happen. If there are no
416 // failovers this shouldn't get called.
417 if ( log.isDebugEnabled() )
418 {
419 log.debug( "No need to restore primary, it is already restored." );
420 return true;
421 }
422 }
423 else if ( fidx < 0 )
424 {
425 // this should never happen
426 log.warn( "Failover index is less than 0, this shouldn't happen" );
427 }
428 }
429 }
430 catch ( IOException e )
431 {
432 // TODO, should try again, or somehow stop the listener
433 log.error(
434 "Trouble trying to deregister old failover listener prior to restoring the primary = "
435 + server, e );
436 }
437
438 // Restore primary
439 // may need to do this more gracefully, letting the failover finish in the background
440 RemoteCacheNoWait failoverNoWait = facade.noWaits[0];
441
442 // swap in a new one
443 facade.noWaits = new RemoteCacheNoWait[1];
444 facade.noWaits[0] = (RemoteCacheNoWait) ic;
445 facade.remoteCacheAttributes.setFailoverIndex( 0 );
446
447 if ( log.isInfoEnabled() )
448 {
449 String message = "Successfully reconnected to PRIMARY remote server. Substituted primary for failoverNoWait [" + failoverNoWait + "]";
450 log.info( message );
451
452 if ( facade.getCacheEventLogger() != null )
453 {
454 facade.getCacheEventLogger().logApplicationEvent( "RemoteCacheFailoverRunner", "RestoredPrimary",
455 message );
456 }
457 }
458 return true;
459 }
460
461 // else alright
462 // if the failover index was at 0 here, we would be in a bad
463 // situation, unless there were just
464 // no failovers configured.
465 if ( log.isDebugEnabled() )
466 {
467 log.debug( "Primary server status in error, not connected." );
468 }
469 }
470 else
471 {
472 if ( log.isDebugEnabled() )
473 {
474 log.debug( "Primary server is null, not connected." );
475 }
476 }
477 }
478 catch ( NumberFormatException ex )
479 {
480 log.error( ex );
481 }
482 return false;
483 }
484
485 /**
486 * Sets the "alright" flag to false in a critical section. This flag
487 * indicates whether or not we are connected to any server at all. If we are
488 * connected to a secondary server, then alright will be true, but we will
489 * continue to try to restore the connection with the primary server.
490 * <p>
491 * The primary server is the first server defines in the FailoverServers
492 * list.
493 */
494 private void bad()
495 {
496 if ( alright )
497 {
498 synchronized ( this )
499 {
500 alright = false;
501 }
502 }
503 }
504 }