View Javadoc

1   package org.apache.jcs.auxiliary.remote;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.IOException;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.jcs.engine.CacheConstants;
27  import org.apache.jcs.engine.behavior.ICache;
28  import org.apache.jcs.engine.behavior.ICompositeCacheManager;
29  import org.apache.jcs.engine.behavior.IElementSerializer;
30  import org.apache.jcs.engine.logging.behavior.ICacheEventLogger;
31  
32  /**
33   * The RemoteCacheFailoverRunner tries to establish a connection with a failover
34   * server, if any are defined. Once a failover connection is made, it will
35   * attempt to replace the failover with the primary remote server.
36   * <p>
37   * It works by switching out the RemoteCacheNoWait inside the Facade.
38   * <p>
39   * Client (i.e.) the CompositeCache has reference to a RemoteCacheNoWaitFacade.
40   * This facade is created by the RemoteCacheFactory. The factory maintains a set
41   * of managers, one for each remote server. Typically, there will only be one
42   * manager.
43   * <p>
44   * If you use multiple remote servers, you may want to set one or more as
45   * failovers. If a local cache cannot connect to the primary server, or looses
46   * its connection to the primary server, it will attempt to restore that
47   * Connection in the background. If failovers are defined, the Failover runner
48   * will try to connect to a failover until the primary is restored.
49   *
50   */
51  public class RemoteCacheFailoverRunner
52      implements Runnable
53  {
54      /** The logger */
55      private final static Log log = LogFactory.getLog( RemoteCacheFailoverRunner.class );
56  
57      /** The facade returned to the composite cache. */
58      private final RemoteCacheNoWaitFacade facade;
59  
60      /** How long to wait between reconnect attempts. */
61      private static long idlePeriod = 20 * 1000;
62  
63      /** Have we reconnected. */
64      private boolean alright = true;
65  
66      /** The cache manager */
67      private final ICompositeCacheManager cacheMgr;
68  
69      /** The event logger. */
70      private final ICacheEventLogger cacheEventLogger;
71  
72      /** The serializer. */
73      private final IElementSerializer elementSerializer;
74  
75      /**
76       * Constructor for the RemoteCacheFailoverRunner object. This allows the
77       * FailoverRunner to modify the facade that the CompositeCache references.
78       *
79       * @param facade
80       *            the facade the CompositeCache talks to.
81       * @param cacheMgr
82       * @param cacheEventLogger
83       * @param elementSerializer
84       */
85      public RemoteCacheFailoverRunner( RemoteCacheNoWaitFacade facade, ICompositeCacheManager cacheMgr,
86                                        ICacheEventLogger cacheEventLogger, IElementSerializer elementSerializer )
87      {
88          this.facade = facade;
89          this.cacheMgr = cacheMgr;
90          this.cacheEventLogger = cacheEventLogger;
91          this.elementSerializer = elementSerializer;
92      }
93  
94      /**
95       * Notifies the cache monitor that an error occurred, and kicks off the
96       * error recovery process.
97       */
98      public void notifyError()
99      {
100         bad();
101         synchronized ( this )
102         {
103             notify();
104         }
105     }
106 
107     /**
108      * Main processing method for the RemoteCacheFailoverRunner object.
109      * <p>
110      * If we do not have a connection with any failover server, this will try to
111      * connect one at a time. If no connection can be made, it goes to sleep for
112      * a while (20 seconds).
113      * <p>
114      * Once a connection with a failover is made, we will try to reconnect to
115      * the primary server.
116      * <p>
117      * The primary server is the first server defines in the FailoverServers
118      * list.
119      */
120     public void run()
121     {
122         // start the main work of connecting to a failover and then restoring
123         // the primary.
124         connectAndRestore();
125 
126         if ( log.isInfoEnabled() )
127         {
128             log.info( "Exiting failover runner. Failover index = " + facade.remoteCacheAttributes.getFailoverIndex() );
129             if ( facade.remoteCacheAttributes.getFailoverIndex() <= 0 )
130             {
131                 log.info( "Failover index is <= 0, meaning we are not " + "connected to a failover server." );
132             }
133             else if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
134             {
135                 log.info( "Failover index is > 0, meaning we are " + "connected to a failover server." );
136             }
137             // log if we are alright or not.
138         }
139     }
140 
141     /**
142      * This is the main loop. If there are failovers defined, then this will
143      * continue until the primary is re-connected. If no failovers are defined,
144      * this will exit automatically.
145      */
146     private void connectAndRestore()
147     {
148         do
149         {
150             log.info( "Remote cache FAILOVER RUNNING." );
151 
152             // there is no active listener
153             if ( !alright )
154             {
155                 // Monitor each RemoteCacheManager instance one after the other.
156                 // Each RemoteCacheManager corresponds to one remote connection.
157                 String[] failovers = facade.remoteCacheAttributes.getFailovers();
158                 // we should probably check to see if there are any failovers,
159                 // even though the caller
160                 // should have already.
161 
162                 if ( failovers == null )
163                 {
164                     log.warn( "Remote is misconfigured, failovers was null." );
165                     return;
166                 }
167                 else if ( failovers.length == 1 )
168                 {
169                     // if there is only the primary, return out of this
170                     if ( log.isInfoEnabled() )
171                     {
172                         log.info( "No failovers defined, exiting failover runner." );
173                         return;
174                     }
175                 }
176 
177                 int fidx = facade.remoteCacheAttributes.getFailoverIndex();
178                 log.debug( "fidx = " + fidx + " failovers.length = " + failovers.length );
179 
180                 // shouldn't we see if the primary is backup?
181                 // If we don't check the primary, if it gets connected in the
182                 // background,
183                 // we will disconnect it only to put it right back
184                 int i = fidx; // + 1; // +1 skips the primary
185                 if ( log.isDebugEnabled() )
186                 {
187                     log.debug( "stating at failover i = " + i );
188                 }
189 
190                 // try them one at a time until successful
191                 for ( ; i < failovers.length && !alright; i++ )
192                 {
193                     String server = failovers[i];
194                     if ( log.isDebugEnabled() )
195                     {
196                         log.debug( "Trying server [" + server + "] at failover index i = " + i );
197                     }
198 
199                     RemoteCacheAttributes rca = null;
200                     try
201                     {
202                         rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
203                         rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
204                         rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
205                         RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr, cacheEventLogger, elementSerializer );
206 
207                         if ( log.isDebugEnabled() )
208                         {
209                             log.debug( "RemoteCacheAttributes for failover = " + rca.toString() );
210                         }
211 
212                         // add a listener if there are none, need to tell rca
213                         // what number it is at
214                         ICache ic = rcm.getCache( rca.getCacheName() );
215                         if ( ic != null )
216                         {
217                             if ( ic.getStatus() == CacheConstants.STATUS_ALIVE )
218                             {
219                                 // may need to do this more gracefully
220                                 log.debug( "reseting no wait" );
221                                 facade.noWaits = new RemoteCacheNoWait[1];
222                                 facade.noWaits[0] = (RemoteCacheNoWait) ic;
223                                 facade.remoteCacheAttributes.setFailoverIndex( i );
224 
225                                 synchronized ( this )
226                                 {
227                                     if ( log.isDebugEnabled() )
228                                     {
229                                         log.debug( "setting ALRIGHT to true" );
230                                         if ( i > 0 )
231                                         {
232                                             log.debug( "Moving to Primary Recovery Mode, failover index = " + i );
233                                         }
234                                         else
235                                         {
236                                             if ( log.isInfoEnabled() )
237                                             {
238                                                 String message = "No need to connect to failover, the primary server is back up.";
239                                                 log.info( message );
240                                             }
241                                         }
242                                     }
243 
244                                     alright = true;
245 
246                                     if ( log.isInfoEnabled() )
247                                     {
248                                         log.info( "CONNECTED to host = [" + rca.getRemoteHost() + "] port = ["
249                                             + rca.getRemotePort() + "]" );
250                                     }
251                                 }
252                             }
253                         }
254                         else
255                         {
256                             log.info( "noWait is null" );
257                         }
258                     }
259                     catch ( Exception ex )
260                     {
261                         bad();
262                         // Problem encountered in fixing the caches managed by a
263                         // RemoteCacheManager instance.
264                         // Soldier on to the next RemoteCacheManager instance.
265                         if ( i == 0 )
266                         {
267                             log.warn( "FAILED to connect, as expected, to primary" + rca.getRemoteHost() + ":"
268                                 + rca.getRemotePort(), ex );
269                         }
270                         else
271                         {
272                             log.error( "FAILED to connect to failover [" + rca.getRemoteHost() + ":"
273                                 + rca.getRemotePort() + "]", ex );
274                         }
275                     }
276                 }
277             }
278             // end if !alright
279             // get here if while index >0 and alright, meaning that we are
280             // connected to some backup server.
281             else
282             {
283                 if ( log.isDebugEnabled() )
284                 {
285                     log.debug( "ALRIGHT is true " );
286                 }
287                 if ( log.isInfoEnabled() )
288                 {
289                     log.info( "Failover runner is in primary recovery mode. Failover index = "
290                         + facade.remoteCacheAttributes.getFailoverIndex() + "\n" + "Will now try to reconnect to primary server." );
291                 }
292             }
293 
294             boolean primaryRestoredSuccessfully = false;
295             // if we are not connected to the primary, try.
296             if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
297             {
298                 primaryRestoredSuccessfully = restorePrimary();
299                 if ( log.isDebugEnabled() )
300                 {
301                     log.debug( "Primary recovery success state = " + primaryRestoredSuccessfully );
302                 }
303             }
304 
305             if ( !primaryRestoredSuccessfully )
306             {
307                 // Time driven mode: sleep between each round of recovery
308                 // attempt.
309                 try
310                 {
311                     log.warn( "Failed to reconnect to primary server. Cache failover runner is going to sleep for "
312                         + idlePeriod + " milliseconds." );
313                     Thread.sleep( idlePeriod );
314                 }
315                 catch ( InterruptedException ex )
316                 {
317                     // ignore;
318                 }
319             }
320 
321             // try to bring the listener back to the primary
322         }
323         while ( facade.remoteCacheAttributes.getFailoverIndex() > 0 || !alright );
324         // continue if the primary is not restored or if things are not alright.
325 
326     }
327 
328     /**
329      * Try to restore the primary server.
330      * <p>
331      * Once primary is restored the failover listener must be deregistered.
332      * <p>
333      * The primary server is the first server defines in the FailoverServers
334      * list.
335      *
336      * @return boolean value indicating whether the resoration was successful
337      */
338     private boolean restorePrimary()
339     {
340         // try to move back to the primary
341         String[] failovers = facade.remoteCacheAttributes.getFailovers();
342         String server = failovers[0];
343 
344         if ( log.isInfoEnabled() )
345         {
346             log.info( "Trying to restore connection to primary remote server [" + server + "]" );
347         }
348 
349         try
350         {
351             RemoteCacheAttributes rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
352             rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
353             rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
354             RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr, cacheEventLogger, elementSerializer );
355 
356             // add a listener if there are none, need to tell rca what number it
357             // is at
358             ICache ic = rcm.getCache( rca.getCacheName() );
359             // by default the listener id should be 0, else it will be the
360             // listener
361             // Originally associated with the remote cache. either way is fine.
362             // We just don't want the listener id from a failover being used.
363             // If the remote server was rebooted this could be a problem if new
364             // locals were also added.
365 
366             if ( ic != null )
367             {
368                 if ( ic.getStatus() == CacheConstants.STATUS_ALIVE )
369                 {
370                     try
371                     {
372                         // we could have more than one listener registered right
373                         // now.
374                         // this will not result in a loop, only duplication
375                         // stop duplicate listening.
376                         if ( facade.noWaits[0] != null && facade.noWaits[0].getStatus() == CacheConstants.STATUS_ALIVE )
377                         {
378                             int fidx = facade.remoteCacheAttributes.getFailoverIndex();
379 
380                             if ( fidx > 0 )
381                             {
382                                 String serverOld = failovers[fidx];
383 
384                                 if ( log.isDebugEnabled() )
385                                 {
386                                     log.debug( "Failover Index = " + fidx + " the server at that index is ["
387                                         + serverOld + "]" );
388                                 }
389 
390                                 if ( serverOld != null )
391                                 {
392                                     // create attributes that reflect the
393                                     // previous failed over configuration.
394                                     RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
395                                     rcaOld.setRemoteHost( serverOld.substring( 0, serverOld.indexOf( ":" ) ) );
396                                     rcaOld.setRemotePort( Integer.parseInt( serverOld.substring( serverOld
397                                         .indexOf( ":" ) + 1 ) ) );
398                                     RemoteCacheManager rcmOld = RemoteCacheManager.getInstance( rcaOld, cacheMgr, cacheEventLogger, elementSerializer );
399 
400                                     if ( rcmOld != null )
401                                     {
402                                         // manager can remove by name if
403                                         // necessary
404                                         rcmOld.removeRemoteCacheListener( rcaOld );
405                                     }
406                                     if ( log.isInfoEnabled() )
407                                     {
408                                         log.info( "Successfully deregistered from FAILOVER remote server = "
409                                             + serverOld );
410                                     }
411                                 }
412                             }
413                             else if ( fidx == 0 )
414                             {
415                                 // this should never happen. If there are no
416                                 // failovers this shouldn't get called.
417                                 if ( log.isDebugEnabled() )
418                                 {
419                                     log.debug( "No need to restore primary, it is already restored." );
420                                     return true;
421                                 }
422                             }
423                             else if ( fidx < 0 )
424                             {
425                                 // this should never happen
426                                 log.warn( "Failover index is less than 0, this shouldn't happen" );
427                             }
428                         }
429                     }
430                     catch ( IOException e )
431                     {
432                         // TODO, should try again, or somehow stop the listener
433                         log.error(
434                                    "Trouble trying to deregister old failover listener prior to restoring the primary = "
435                                        + server, e );
436                     }
437 
438                     // Restore primary
439                     // may need to do this more gracefully, letting the failover finish in the background
440                     RemoteCacheNoWait failoverNoWait = facade.noWaits[0];
441 
442                     // swap in a new one
443                     facade.noWaits = new RemoteCacheNoWait[1];
444                     facade.noWaits[0] = (RemoteCacheNoWait) ic;
445                     facade.remoteCacheAttributes.setFailoverIndex( 0 );
446 
447                     if ( log.isInfoEnabled() )
448                     {
449                         String message = "Successfully reconnected to PRIMARY remote server.  Substituted primary for failoverNoWait [" + failoverNoWait + "]";
450                         log.info( message );
451 
452                         if ( facade.getCacheEventLogger() != null )
453                         {
454                             facade.getCacheEventLogger().logApplicationEvent( "RemoteCacheFailoverRunner", "RestoredPrimary",
455                                                                               message );
456                         }
457                     }
458                     return true;
459                 }
460 
461                 // else alright
462                 // if the failover index was at 0 here, we would be in a bad
463                 // situation, unless there were just
464                 // no failovers configured.
465                 if ( log.isDebugEnabled() )
466                 {
467                     log.debug( "Primary server status in error, not connected." );
468                 }
469             }
470             else
471             {
472                 if ( log.isDebugEnabled() )
473                 {
474                     log.debug( "Primary server is null, not connected." );
475                 }
476             }
477         }
478         catch ( NumberFormatException ex )
479         {
480             log.error( ex );
481         }
482         return false;
483     }
484 
485     /**
486      * Sets the "alright" flag to false in a critical section. This flag
487      * indicates whether or not we are connected to any server at all. If we are
488      * connected to a secondary server, then alright will be true, but we will
489      * continue to try to restore the connection with the primary server.
490      * <p>
491      * The primary server is the first server defines in the FailoverServers
492      * list.
493      */
494     private void bad()
495     {
496         if ( alright )
497         {
498             synchronized ( this )
499             {
500                 alright = false;
501             }
502         }
503     }
504 }