View Javadoc
1   package org.apache.commons.jcs.auxiliary.remote;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.IOException;
23  import java.util.List;
24  import java.util.ListIterator;
25  
26  import org.apache.commons.jcs.auxiliary.AbstractAuxiliaryCacheMonitor;
27  import org.apache.commons.jcs.auxiliary.remote.behavior.IRemoteCacheAttributes;
28  import org.apache.commons.jcs.engine.CacheStatus;
29  import org.apache.commons.jcs.engine.behavior.ICache;
30  
31  /**
32   * The RemoteCacheFailoverRunner tries to establish a connection with a failover
33   * server, if any are defined. Once a failover connection is made, it will
34   * attempt to replace the failover with the primary remote server.
35   * <p>
36   * It works by switching out the RemoteCacheNoWait inside the Facade.
37   * <p>
38   * Client (i.e.) the CompositeCache has reference to a RemoteCacheNoWaitFacade.
39   * This facade is created by the RemoteCacheFactory. The factory maintains a set
40   * of managers, one for each remote server. Typically, there will only be one
41   * manager.
42   * <p>
43   * If you use multiple remote servers, you may want to set one or more as
44   * failovers. If a local cache cannot connect to the primary server, or looses
45   * its connection to the primary server, it will attempt to restore that
46   * Connection in the background. If failovers are defined, the Failover runner
47   * will try to connect to a failover until the primary is restored.
48   *
49   */
50  public class RemoteCacheFailoverRunner<K, V> extends AbstractAuxiliaryCacheMonitor
51  {
52      /** The facade returned to the composite cache. */
53      private final RemoteCacheNoWaitFacade<K, V> facade;
54  
55      /** Factory instance */
56      private final RemoteCacheFactory cacheFactory;
57  
58      /**
59       * Constructor for the RemoteCacheFailoverRunner object. This allows the
60       * FailoverRunner to modify the facade that the CompositeCache references.
61       *
62       * @param facade the facade the CompositeCache talks to.
63       * @param cacheFactory the cache factory instance
64       */
65      public RemoteCacheFailoverRunner( RemoteCacheNoWaitFacade<K, V> facade, RemoteCacheFactory cacheFactory )
66      {
67          super("JCS-RemoteCacheFailoverRunner");
68          this.facade = facade;
69          this.cacheFactory = cacheFactory;
70          setIdlePeriod(20000L);
71      }
72  
73      /**
74       * Clean up all resources before shutdown
75       */
76      @Override
77      protected void dispose()
78      {
79          // empty
80      }
81  
82      /**
83       * do actual work
84       */
85      @Override
86      protected void doWork()
87      {
88          // empty
89      }
90  
91  
92      /**
93       * Main processing method for the RemoteCacheFailoverRunner object.
94       * <p>
95       * If we do not have a connection with any failover server, this will try to
96       * connect one at a time. If no connection can be made, it goes to sleep for
97       * a while (20 seconds).
98       * <p>
99       * Once a connection with a failover is made, we will try to reconnect to
100      * the primary server.
101      * <p>
102      * The primary server is the first server defines in the FailoverServers
103      * list.
104      */
105     @Override
106     public void run()
107     {
108         // start the main work of connecting to a failover and then restoring
109         // the primary.
110         connectAndRestore();
111 
112         if ( log.isInfoEnabled() )
113         {
114             int failoverIndex = facade.getAuxiliaryCacheAttributes().getFailoverIndex();
115             log.info( "Exiting failover runner. Failover index = " + failoverIndex);
116 
117             if ( failoverIndex <= 0 )
118             {
119                 log.info( "Failover index is <= 0, meaning we are not connected to a failover server." );
120             }
121             else if ( failoverIndex > 0 )
122             {
123                 log.info( "Failover index is > 0, meaning we are connected to a failover server." );
124             }
125             // log if we are allright or not.
126         }
127     }
128 
129     /**
130      * This is the main loop. If there are failovers defined, then this will
131      * continue until the primary is re-connected. If no failovers are defined,
132      * this will exit automatically.
133      */
134     private void connectAndRestore()
135     {
136         IRemoteCacheAttributes rca0 = facade.getAuxiliaryCacheAttributes();
137 
138         do
139         {
140             log.info( "Remote cache FAILOVER RUNNING." );
141 
142             // there is no active listener
143             if ( !allright.get() )
144             {
145                 // Monitor each RemoteCacheManager instance one after the other.
146                 // Each RemoteCacheManager corresponds to one remote connection.
147                 List<RemoteLocation> failovers = rca0.getFailovers();
148                 // we should probably check to see if there are any failovers,
149                 // even though the caller
150                 // should have already.
151 
152                 if ( failovers == null )
153                 {
154                     log.warn( "Remote is misconfigured, failovers was null." );
155                     return;
156                 }
157                 else if ( failovers.size() == 1 )
158                 {
159                     // if there is only the primary, return out of this
160                     log.info( "No failovers defined, exiting failover runner." );
161                     return;
162                 }
163 
164                 int fidx = rca0.getFailoverIndex();
165                 log.debug( "fidx = " + fidx + " failovers.size = " + failovers.size() );
166 
167                 // shouldn't we see if the primary is backup?
168                 // If we don't check the primary, if it gets connected in the
169                 // background,
170                 // we will disconnect it only to put it right back
171                 ListIterator<RemoteLocation> i = failovers.listIterator(fidx); // + 1; // +1 skips the primary
172                 if ( log.isDebugEnabled() )
173                 {
174                     log.debug( "starting at failover i = " + i.nextIndex() );
175                 }
176 
177                 // try them one at a time until successful
178                 for ( ; i.hasNext() && !allright.get();)
179                 {
180                     RemoteLocation server = i.next();
181                     if ( log.isDebugEnabled() )
182                     {
183                         log.debug( "Trying server [" + server + "] at failover index i = " + i );
184                     }
185 
186                     RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
187                     rca.setRemoteLocation(server);
188                     RemoteCacheManager rcm = cacheFactory.getManager( rca );
189 
190                     if ( log.isDebugEnabled() )
191                     {
192                         log.debug( "RemoteCacheAttributes for failover = " + rca.toString() );
193                     }
194 
195                     if (rcm != null)
196                     {
197                         // add a listener if there are none, need to tell rca
198                         // what number it is at
199                         ICache<K, V> ic = rcm.getCache( rca );
200                         if ( ic.getStatus() == CacheStatus.ALIVE )
201                         {
202                             // may need to do this more gracefully
203                             log.debug( "resetting no wait" );
204                             facade.restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
205                             rca0.setFailoverIndex( i.nextIndex() );
206 
207                             if ( log.isDebugEnabled() )
208                             {
209                                 log.debug( "setting ALLRIGHT to true" );
210                                 if ( i.hasPrevious() )
211                                 {
212                                     log.debug( "Moving to Primary Recovery Mode, failover index = " + i.nextIndex() );
213                                 }
214                                 else
215                                 {
216                                     log.debug( "No need to connect to failover, the primary server is back up." );
217                                 }
218                             }
219 
220                             allright.set(true);
221 
222                             if ( log.isInfoEnabled() )
223                             {
224                                 log.info( "CONNECTED to host = [" + rca.getRemoteLocation() + "]" );
225                             }
226                         }
227                     }
228                 }
229             }
230             // end if !allright
231             // get here if while index >0 and allright, meaning that we are
232             // connected to some backup server.
233             else
234             {
235                 if ( log.isDebugEnabled() )
236                 {
237                     log.debug( "ALLRIGHT is true " );
238                 }
239                 if ( log.isInfoEnabled() )
240                 {
241                     log.info( "Failover runner is in primary recovery mode. Failover index = "
242                         + rca0.getFailoverIndex() + "\n" + "Will now try to reconnect to primary server." );
243                 }
244             }
245 
246             boolean primaryRestoredSuccessfully = false;
247             // if we are not connected to the primary, try.
248             if ( rca0.getFailoverIndex() > 0 )
249             {
250                 primaryRestoredSuccessfully = restorePrimary();
251                 if ( log.isDebugEnabled() )
252                 {
253                     log.debug( "Primary recovery success state = " + primaryRestoredSuccessfully );
254                 }
255             }
256 
257             if ( !primaryRestoredSuccessfully )
258             {
259                 // Time driven mode: sleep between each round of recovery
260                 // attempt.
261                 try
262                 {
263                     log.warn( "Failed to reconnect to primary server. Cache failover runner is going to sleep for "
264                         + idlePeriod + " milliseconds." );
265                     Thread.sleep( idlePeriod );
266                 }
267                 catch ( InterruptedException ex )
268                 {
269                     // ignore;
270                 }
271             }
272 
273             // try to bring the listener back to the primary
274         }
275         while ( rca0.getFailoverIndex() > 0 || !allright.get() );
276         // continue if the primary is not restored or if things are not allright.
277     }
278 
279     /**
280      * Try to restore the primary server.
281      * <p>
282      * Once primary is restored the failover listener must be deregistered.
283      * <p>
284      * The primary server is the first server defines in the FailoverServers
285      * list.
286      *
287      * @return boolean value indicating whether the restoration was successful
288      */
289     private boolean restorePrimary()
290     {
291         IRemoteCacheAttributes rca0 = facade.getAuxiliaryCacheAttributes();
292         // try to move back to the primary
293         RemoteLocation server = rca0.getFailovers().get(0);
294 
295         if ( log.isInfoEnabled() )
296         {
297             log.info( "Trying to restore connection to primary remote server [" + server + "]" );
298         }
299 
300         RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
301         rca.setRemoteLocation(server);
302         RemoteCacheManager rcm = cacheFactory.getManager( rca );
303 
304         if (rcm != null)
305         {
306             // add a listener if there are none, need to tell rca what number it
307             // is at
308             ICache<K, V> ic = rcm.getCache( rca );
309             // by default the listener id should be 0, else it will be the
310             // listener
311             // Originally associated with the remote cache. either way is fine.
312             // We just don't want the listener id from a failover being used.
313             // If the remote server was rebooted this could be a problem if new
314             // locals were also added.
315 
316             if ( ic.getStatus() == CacheStatus.ALIVE )
317             {
318                 try
319                 {
320                     // we could have more than one listener registered right
321                     // now.
322                     // this will not result in a loop, only duplication
323                     // stop duplicate listening.
324                     if ( facade.getPrimaryServer() != null && facade.getPrimaryServer().getStatus() == CacheStatus.ALIVE )
325                     {
326                         int fidx = rca0.getFailoverIndex();
327 
328                         if ( fidx > 0 )
329                         {
330                             RemoteLocation serverOld = rca0.getFailovers().get(fidx);
331 
332                             if ( log.isDebugEnabled() )
333                             {
334                                 log.debug( "Failover Index = " + fidx + " the server at that index is ["
335                                     + serverOld + "]" );
336                             }
337 
338                             if ( serverOld != null )
339                             {
340                                 // create attributes that reflect the
341                                 // previous failed over configuration.
342                                 RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) rca0.clone();
343                                 rcaOld.setRemoteLocation(serverOld);
344                                 RemoteCacheManager rcmOld = cacheFactory.getManager( rcaOld );
345 
346                                 if ( rcmOld != null )
347                                 {
348                                     // manager can remove by name if
349                                     // necessary
350                                     rcmOld.removeRemoteCacheListener( rcaOld );
351                                 }
352                                 if ( log.isInfoEnabled() )
353                                 {
354                                     log.info( "Successfully deregistered from FAILOVER remote server = "
355                                         + serverOld );
356                                 }
357                             }
358                         }
359                         else if ( fidx == 0 )
360                         {
361                             // this should never happen. If there are no
362                             // failovers this shouldn't get called.
363                             if ( log.isDebugEnabled() )
364                             {
365                                 log.debug( "No need to restore primary, it is already restored." );
366                                 return true;
367                             }
368                         }
369                         else if ( fidx < 0 )
370                         {
371                             // this should never happen
372                             log.warn( "Failover index is less than 0, this shouldn't happen" );
373                         }
374                     }
375                 }
376                 catch ( IOException e )
377                 {
378                     // TODO, should try again, or somehow stop the listener
379                     log.error("Trouble trying to deregister old failover listener prior to restoring the primary = "
380                            + server, e );
381                 }
382 
383                 // Restore primary
384                 // may need to do this more gracefully, letting the failover finish in the background
385                 RemoteCacheNoWait<K, V> failoverNoWait = facade.getPrimaryServer();
386 
387                 // swap in a new one
388                 facade.restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
389                 rca0.setFailoverIndex( 0 );
390 
391                 if ( log.isInfoEnabled() )
392                 {
393                     String message = "Successfully reconnected to PRIMARY remote server.  Substituted primary for failoverNoWait ["
394                         + failoverNoWait + "]";
395                     log.info( message );
396 
397                     if ( facade.getCacheEventLogger() != null )
398                     {
399                         facade.getCacheEventLogger().logApplicationEvent( "RemoteCacheFailoverRunner", "RestoredPrimary", message );
400                     }
401                 }
402                 return true;
403             }
404         }
405 
406         // else all right
407         // if the failover index was at 0 here, we would be in a bad
408         // situation, unless there were just
409         // no failovers configured.
410         if ( log.isDebugEnabled() )
411         {
412             log.debug( "Primary server status in error, not connected." );
413         }
414 
415         return false;
416     }
417 }