001 package org.apache.jcs.auxiliary.remote;
002
003 /*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements. See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership. The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License. You may obtain a copy of the License at
011 *
012 * http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied. See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022 import java.io.IOException;
023 import java.io.Serializable;
024
025 import org.apache.commons.logging.Log;
026 import org.apache.commons.logging.LogFactory;
027 import org.apache.jcs.engine.CacheStatus;
028 import org.apache.jcs.engine.behavior.ICache;
029 import org.apache.jcs.engine.behavior.ICompositeCacheManager;
030 import org.apache.jcs.engine.behavior.IElementSerializer;
031 import org.apache.jcs.engine.logging.behavior.ICacheEventLogger;
032
033 /**
034 * The RemoteCacheFailoverRunner tries to establish a connection with a failover
035 * server, if any are defined. Once a failover connection is made, it will
036 * attempt to replace the failover with the primary remote server.
037 * <p>
038 * It works by switching out the RemoteCacheNoWait inside the Facade.
039 * <p>
040 * Client (i.e.) the CompositeCache has reference to a RemoteCacheNoWaitFacade.
041 * This facade is created by the RemoteCacheFactory. The factory maintains a set
042 * of managers, one for each remote server. Typically, there will only be one
043 * manager.
044 * <p>
045 * If you use multiple remote servers, you may want to set one or more as
046 * failovers. If a local cache cannot connect to the primary server, or looses
047 * its connection to the primary server, it will attempt to restore that
048 * Connection in the background. If failovers are defined, the Failover runner
049 * will try to connect to a failover until the primary is restored.
050 *
051 */
052 public class RemoteCacheFailoverRunner<K extends Serializable, V extends Serializable>
053 implements Runnable
054 {
055 /** The logger */
056 private final static Log log = LogFactory.getLog( RemoteCacheFailoverRunner.class );
057
058 /** The facade returned to the composite cache. */
059 private final RemoteCacheNoWaitFacade<K, V> facade;
060
061 /** How long to wait between reconnect attempts. */
062 private static long idlePeriod = 20 * 1000;
063
064 /** Have we reconnected. */
065 private boolean alright = true;
066
067 /** The cache manager */
068 private final ICompositeCacheManager cacheMgr;
069
070 /** The event logger. */
071 private final ICacheEventLogger cacheEventLogger;
072
073 /** The serializer. */
074 private final IElementSerializer elementSerializer;
075
076 /**
077 * Constructor for the RemoteCacheFailoverRunner object. This allows the
078 * FailoverRunner to modify the facade that the CompositeCache references.
079 *
080 * @param facade
081 * the facade the CompositeCache talks to.
082 * @param cacheMgr
083 * @param cacheEventLogger
084 * @param elementSerializer
085 */
086 public RemoteCacheFailoverRunner( RemoteCacheNoWaitFacade<K, V> facade, ICompositeCacheManager cacheMgr,
087 ICacheEventLogger cacheEventLogger, IElementSerializer elementSerializer )
088 {
089 this.facade = facade;
090 this.cacheMgr = cacheMgr;
091 this.cacheEventLogger = cacheEventLogger;
092 this.elementSerializer = elementSerializer;
093 }
094
095 /**
096 * Notifies the cache monitor that an error occurred, and kicks off the
097 * error recovery process.
098 */
099 public void notifyError()
100 {
101 bad();
102 synchronized ( this )
103 {
104 notify();
105 }
106 }
107
108 /**
109 * Main processing method for the RemoteCacheFailoverRunner object.
110 * <p>
111 * If we do not have a connection with any failover server, this will try to
112 * connect one at a time. If no connection can be made, it goes to sleep for
113 * a while (20 seconds).
114 * <p>
115 * Once a connection with a failover is made, we will try to reconnect to
116 * the primary server.
117 * <p>
118 * The primary server is the first server defines in the FailoverServers
119 * list.
120 */
121 public void run()
122 {
123 // start the main work of connecting to a failover and then restoring
124 // the primary.
125 connectAndRestore();
126
127 if ( log.isInfoEnabled() )
128 {
129 log.info( "Exiting failover runner. Failover index = " + facade.remoteCacheAttributes.getFailoverIndex() );
130 if ( facade.remoteCacheAttributes.getFailoverIndex() <= 0 )
131 {
132 log.info( "Failover index is <= 0, meaning we are not " + "connected to a failover server." );
133 }
134 else if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
135 {
136 log.info( "Failover index is > 0, meaning we are " + "connected to a failover server." );
137 }
138 // log if we are alright or not.
139 }
140 }
141
142 /**
143 * This is the main loop. If there are failovers defined, then this will
144 * continue until the primary is re-connected. If no failovers are defined,
145 * this will exit automatically.
146 */
147 @SuppressWarnings("unchecked") // No generic arrays in java
148 private void connectAndRestore()
149 {
150 do
151 {
152 log.info( "Remote cache FAILOVER RUNNING." );
153
154 // there is no active listener
155 if ( !alright )
156 {
157 // Monitor each RemoteCacheManager instance one after the other.
158 // Each RemoteCacheManager corresponds to one remote connection.
159 String[] failovers = facade.remoteCacheAttributes.getFailovers();
160 // we should probably check to see if there are any failovers,
161 // even though the caller
162 // should have already.
163
164 if ( failovers == null )
165 {
166 log.warn( "Remote is misconfigured, failovers was null." );
167 return;
168 }
169 else if ( failovers.length == 1 )
170 {
171 // if there is only the primary, return out of this
172 if ( log.isInfoEnabled() )
173 {
174 log.info( "No failovers defined, exiting failover runner." );
175 return;
176 }
177 }
178
179 int fidx = facade.remoteCacheAttributes.getFailoverIndex();
180 log.debug( "fidx = " + fidx + " failovers.length = " + failovers.length );
181
182 // shouldn't we see if the primary is backup?
183 // If we don't check the primary, if it gets connected in the
184 // background,
185 // we will disconnect it only to put it right back
186 int i = fidx; // + 1; // +1 skips the primary
187 if ( log.isDebugEnabled() )
188 {
189 log.debug( "stating at failover i = " + i );
190 }
191
192 // try them one at a time until successful
193 for ( ; i < failovers.length && !alright; i++ )
194 {
195 String server = failovers[i];
196 if ( log.isDebugEnabled() )
197 {
198 log.debug( "Trying server [" + server + "] at failover index i = " + i );
199 }
200
201 RemoteCacheAttributes rca = null;
202 try
203 {
204 rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
205 rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
206 rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
207 RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr, cacheEventLogger, elementSerializer );
208
209 if ( log.isDebugEnabled() )
210 {
211 log.debug( "RemoteCacheAttributes for failover = " + rca.toString() );
212 }
213
214 // add a listener if there are none, need to tell rca
215 // what number it is at
216 ICache<K, V> ic = rcm.getCache( rca.getCacheName() );
217 if ( ic != null )
218 {
219 if ( ic.getStatus() == CacheStatus.ALIVE )
220 {
221 // may need to do this more gracefully
222 log.debug( "resetting no wait" );
223 facade.noWaits = new RemoteCacheNoWait[1];
224 facade.noWaits[0] = (RemoteCacheNoWait<K, V>) ic;
225 facade.remoteCacheAttributes.setFailoverIndex( i );
226
227 synchronized ( this )
228 {
229 if ( log.isDebugEnabled() )
230 {
231 log.debug( "setting ALRIGHT to true" );
232 if ( i > 0 )
233 {
234 log.debug( "Moving to Primary Recovery Mode, failover index = " + i );
235 }
236 else
237 {
238 if ( log.isInfoEnabled() )
239 {
240 String message = "No need to connect to failover, the primary server is back up.";
241 log.info( message );
242 }
243 }
244 }
245
246 alright = true;
247
248 if ( log.isInfoEnabled() )
249 {
250 log.info( "CONNECTED to host = [" + rca.getRemoteHost() + "] port = ["
251 + rca.getRemotePort() + "]" );
252 }
253 }
254 }
255 }
256 else
257 {
258 log.info( "noWait is null" );
259 }
260 }
261 catch ( Exception ex )
262 {
263 bad();
264 // Problem encountered in fixing the caches managed by a
265 // RemoteCacheManager instance.
266 // Soldier on to the next RemoteCacheManager instance.
267 String remoteHost = (rca == null) ? "null" : rca.getRemoteHost();
268 int remotePort = (rca == null) ? 0 : rca.getRemotePort();
269 if ( i == 0 )
270 {
271 log.warn( "FAILED to connect, as expected, to primary" + remoteHost + ":"
272 + remotePort, ex );
273 }
274 else
275 {
276 log.error( "FAILED to connect to failover [" + remoteHost + ":"
277 + remotePort + "]", ex );
278 }
279 }
280 }
281 }
282 // end if !alright
283 // get here if while index >0 and alright, meaning that we are
284 // connected to some backup server.
285 else
286 {
287 if ( log.isDebugEnabled() )
288 {
289 log.debug( "ALRIGHT is true " );
290 }
291 if ( log.isInfoEnabled() )
292 {
293 log.info( "Failover runner is in primary recovery mode. Failover index = "
294 + facade.remoteCacheAttributes.getFailoverIndex() + "\n" + "Will now try to reconnect to primary server." );
295 }
296 }
297
298 boolean primaryRestoredSuccessfully = false;
299 // if we are not connected to the primary, try.
300 if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
301 {
302 primaryRestoredSuccessfully = restorePrimary();
303 if ( log.isDebugEnabled() )
304 {
305 log.debug( "Primary recovery success state = " + primaryRestoredSuccessfully );
306 }
307 }
308
309 if ( !primaryRestoredSuccessfully )
310 {
311 // Time driven mode: sleep between each round of recovery
312 // attempt.
313 try
314 {
315 log.warn( "Failed to reconnect to primary server. Cache failover runner is going to sleep for "
316 + idlePeriod + " milliseconds." );
317 Thread.sleep( idlePeriod );
318 }
319 catch ( InterruptedException ex )
320 {
321 // ignore;
322 }
323 }
324
325 // try to bring the listener back to the primary
326 }
327 while ( facade.remoteCacheAttributes.getFailoverIndex() > 0 || !alright );
328 // continue if the primary is not restored or if things are not alright.
329
330 }
331
332 /**
333 * Try to restore the primary server.
334 * <p>
335 * Once primary is restored the failover listener must be deregistered.
336 * <p>
337 * The primary server is the first server defines in the FailoverServers
338 * list.
339 *
340 * @return boolean value indicating whether the restoration was successful
341 */
342 @SuppressWarnings("unchecked") // No generic arrays in java
343 private boolean restorePrimary()
344 {
345 // try to move back to the primary
346 String[] failovers = facade.remoteCacheAttributes.getFailovers();
347 String server = failovers[0];
348
349 if ( log.isInfoEnabled() )
350 {
351 log.info( "Trying to restore connection to primary remote server [" + server + "]" );
352 }
353
354 try
355 {
356 RemoteCacheAttributes rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
357 rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
358 rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
359 RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr, cacheEventLogger, elementSerializer );
360
361 // add a listener if there are none, need to tell rca what number it
362 // is at
363 ICache<K, V> ic = rcm.getCache( rca.getCacheName() );
364 // by default the listener id should be 0, else it will be the
365 // listener
366 // Originally associated with the remote cache. either way is fine.
367 // We just don't want the listener id from a failover being used.
368 // If the remote server was rebooted this could be a problem if new
369 // locals were also added.
370
371 if ( ic != null )
372 {
373 if ( ic.getStatus() == CacheStatus.ALIVE )
374 {
375 try
376 {
377 // we could have more than one listener registered right
378 // now.
379 // this will not result in a loop, only duplication
380 // stop duplicate listening.
381 if ( facade.noWaits[0] != null && facade.noWaits[0].getStatus() == CacheStatus.ALIVE )
382 {
383 int fidx = facade.remoteCacheAttributes.getFailoverIndex();
384
385 if ( fidx > 0 )
386 {
387 String serverOld = failovers[fidx];
388
389 if ( log.isDebugEnabled() )
390 {
391 log.debug( "Failover Index = " + fidx + " the server at that index is ["
392 + serverOld + "]" );
393 }
394
395 if ( serverOld != null )
396 {
397 // create attributes that reflect the
398 // previous failed over configuration.
399 RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
400 rcaOld.setRemoteHost( serverOld.substring( 0, serverOld.indexOf( ":" ) ) );
401 rcaOld.setRemotePort( Integer.parseInt( serverOld.substring( serverOld
402 .indexOf( ":" ) + 1 ) ) );
403 RemoteCacheManager rcmOld = RemoteCacheManager.getInstance( rcaOld, cacheMgr, cacheEventLogger, elementSerializer );
404
405 if ( rcmOld != null )
406 {
407 // manager can remove by name if
408 // necessary
409 rcmOld.removeRemoteCacheListener( rcaOld );
410 }
411 if ( log.isInfoEnabled() )
412 {
413 log.info( "Successfully deregistered from FAILOVER remote server = "
414 + serverOld );
415 }
416 }
417 }
418 else if ( fidx == 0 )
419 {
420 // this should never happen. If there are no
421 // failovers this shouldn't get called.
422 if ( log.isDebugEnabled() )
423 {
424 log.debug( "No need to restore primary, it is already restored." );
425 return true;
426 }
427 }
428 else if ( fidx < 0 )
429 {
430 // this should never happen
431 log.warn( "Failover index is less than 0, this shouldn't happen" );
432 }
433 }
434 }
435 catch ( IOException e )
436 {
437 // TODO, should try again, or somehow stop the listener
438 log.error(
439 "Trouble trying to deregister old failover listener prior to restoring the primary = "
440 + server, e );
441 }
442
443 // Restore primary
444 // may need to do this more gracefully, letting the failover finish in the background
445 RemoteCacheNoWait<K, V> failoverNoWait = facade.noWaits[0];
446
447 // swap in a new one
448 facade.noWaits = new RemoteCacheNoWait[1];
449 facade.noWaits[0] = (RemoteCacheNoWait<K, V>) ic;
450 facade.remoteCacheAttributes.setFailoverIndex( 0 );
451
452 if ( log.isInfoEnabled() )
453 {
454 String message = "Successfully reconnected to PRIMARY remote server. Substituted primary for failoverNoWait [" + failoverNoWait + "]";
455 log.info( message );
456
457 if ( facade.getCacheEventLogger() != null )
458 {
459 facade.getCacheEventLogger().logApplicationEvent( "RemoteCacheFailoverRunner", "RestoredPrimary",
460 message );
461 }
462 }
463 return true;
464 }
465
466 // else all right
467 // if the failover index was at 0 here, we would be in a bad
468 // situation, unless there were just
469 // no failovers configured.
470 if ( log.isDebugEnabled() )
471 {
472 log.debug( "Primary server status in error, not connected." );
473 }
474 }
475 else
476 {
477 if ( log.isDebugEnabled() )
478 {
479 log.debug( "Primary server is null, not connected." );
480 }
481 }
482 }
483 catch ( NumberFormatException ex )
484 {
485 log.error( ex );
486 }
487 return false;
488 }
489
490 /**
491 * Sets the "alright" flag to false in a critical section. This flag
492 * indicates whether or not we are connected to any server at all. If we are
493 * connected to a secondary server, then alright will be true, but we will
494 * continue to try to restore the connection with the primary server.
495 * <p>
496 * The primary server is the first server defines in the FailoverServers
497 * list.
498 */
499 private void bad()
500 {
501 if ( alright )
502 {
503 synchronized ( this )
504 {
505 alright = false;
506 }
507 }
508 }
509 }