View Javadoc
1   package org.apache.commons.jcs3.auxiliary.remote;
2   
3   import java.io.IOException;
4   
5   /*
6    * Licensed to the Apache Software Foundation (ASF) under one
7    * or more contributor license agreements.  See the NOTICE file
8    * distributed with this work for additional information
9    * regarding copyright ownership.  The ASF licenses this file
10   * to you under the Apache License, Version 2.0 (the
11   * "License"); you may not use this file except in compliance
12   * with the License.  You may obtain a copy of the License at
13   *
14   *   http://www.apache.org/licenses/LICENSE-2.0
15   *
16   * Unless required by applicable law or agreed to in writing,
17   * software distributed under the License is distributed on an
18   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
19   * KIND, either express or implied.  See the License for the
20   * specific language governing permissions and limitations
21   * under the License.
22   */
23  
24  import java.util.List;
25  import java.util.ListIterator;
26  import java.util.concurrent.atomic.AtomicBoolean;
27  
28  import org.apache.commons.jcs3.auxiliary.remote.behavior.IRemoteCacheAttributes;
29  import org.apache.commons.jcs3.auxiliary.remote.server.behavior.RemoteType;
30  import org.apache.commons.jcs3.engine.CacheStatus;
31  import org.apache.commons.jcs3.engine.behavior.ICache;
32  import org.apache.commons.jcs3.engine.behavior.IElementSerializer;
33  import org.apache.commons.jcs3.engine.logging.behavior.ICacheEventLogger;
34  import org.apache.commons.jcs3.log.Log;
35  import org.apache.commons.jcs3.log.LogManager;
36  
37  /**
38   * Used to provide access to multiple services under nowait protection. Factory should construct
39   * NoWaitFacade to give to the composite cache out of caches it constructs from the varies manager
40   * to lateral services.
41   * <p>
42   * Typically, we only connect to one remote server per facade. We use a list of one
43   * RemoteCacheNoWait.
44   */
45  public class RemoteCacheNoWaitFacade<K, V>
46      extends AbstractRemoteCacheNoWaitFacade<K, V>
47  {
48      /** log instance */
49      private static final Log log = LogManager.getLog( RemoteCacheNoWaitFacade.class );
50  
51      /** Provide factory instance to RemoteCacheFailoverRunner */
52      private final RemoteCacheFactory cacheFactory;
53  
54      /** Attempt to restore primary connection (switched off for testing) */
55      protected boolean attemptRestorePrimary = true;
56  
57      /** Time in ms to sleep between failover attempts */
58      private static final long idlePeriod = 20000L;
59  
60      /**
61       * Constructs with the given remote cache, and fires events to any listeners.
62       * <p>
63       * @param noWaits
64       * @param rca
65       * @param cacheEventLogger
66       * @param elementSerializer
67       * @param cacheFactory
68       */
69      public RemoteCacheNoWaitFacade( final List<RemoteCacheNoWait<K,V>> noWaits,
70                                      final IRemoteCacheAttributes rca,
71                                      final ICacheEventLogger cacheEventLogger,
72                                      final IElementSerializer elementSerializer,
73                                      final RemoteCacheFactory cacheFactory)
74      {
75          super( noWaits, rca, cacheEventLogger, elementSerializer );
76          this.cacheFactory = cacheFactory;
77      }
78  
79      /**
80       * Begin the failover process if this is a local cache. Clustered remote caches do not failover.
81       * <p>
82       * @param rcnw The no wait in error.
83       */
84      @Override
85      protected void failover( final RemoteCacheNoWait<K, V> rcnw )
86      {
87          log.debug( "in failover for {0}", rcnw );
88  
89          if ( getAuxiliaryCacheAttributes().getRemoteType() == RemoteType.LOCAL )
90          {
91              if ( rcnw.getStatus() == CacheStatus.ERROR )
92              {
93                  // start failover, primary recovery process
94                  final Thread runner = new Thread(this::connectAndRestore);
95                  runner.setDaemon( true );
96                  runner.start();
97  
98                  if ( getCacheEventLogger() != null )
99                  {
100                     getCacheEventLogger().logApplicationEvent( "RemoteCacheNoWaitFacade", "InitiatedFailover",
101                                                                rcnw + " was in error." );
102                 }
103             }
104             else
105             {
106                 log.info( "The noWait is not in error" );
107             }
108         }
109     }
110 
111     /**
112      * The thread tries to establish a connection with a failover
113      * server, if any are defined. Once a failover connection is made, it will
114      * attempt to replace the failover with the primary remote server.
115      * <p>
116      * It works by switching out the RemoteCacheNoWait inside the Facade.
117      * <p>
118      * Client (i.e.) the CompositeCache has reference to a RemoteCacheNoWaitFacade.
119      * This facade is created by the RemoteCacheFactory. The factory maintains a set
120      * of managers, one for each remote server. Typically, there will only be one
121      * manager.
122      * <p>
123      * If you use multiple remote servers, you may want to set one or more as
124      * failovers. If a local cache cannot connect to the primary server, or looses
125      * its connection to the primary server, it will attempt to restore that
126      * Connection in the background. If failovers are defined, the Failover runner
127      * will try to connect to a failover until the primary is restored.
128      * If no failovers are defined, this will exit automatically.
129      *
130      * @since 3.1
131      */
132     protected void connectAndRestore()
133     {
134         final IRemoteCacheAttributes rca0 = getAuxiliaryCacheAttributes();
135         // Each RemoteCacheManager corresponds to one remote connection.
136         final List<RemoteLocation> failovers = rca0.getFailovers();
137         // we should probably check to see if there are any failovers,
138         // even though the caller should have already.
139 
140         if ( failovers == null )
141         {
142             log.warn( "Remote is misconfigured, failovers was null." );
143             return;
144         }
145         if ( failovers.size() == 1 )
146         {
147             // if there is only the primary, return out of this
148             log.info( "No failovers defined, exiting failover runner." );
149             return;
150         }
151 
152         final AtomicBoolean allright = new AtomicBoolean(false);
153 
154         do
155         {
156             log.info( "Remote cache FAILOVER RUNNING." );
157 
158             // there is no active listener
159             if ( !allright.get() )
160             {
161                 // Monitor each RemoteCacheManager instance one after the other.
162                 final int fidx = rca0.getFailoverIndex();
163                 log.debug( "fidx = {0} failovers.size = {1}", rca0::getFailoverIndex, failovers::size);
164 
165                 // If we don't check the primary, if it gets connected in the
166                 // background,
167                 // we will disconnect it only to put it right back
168                 final ListIterator<RemoteLocation> i = failovers.listIterator(fidx); // + 1; // +1 skips the primary
169                 log.debug( "starting at failover i = {0}", i );
170 
171                 // try them one at a time until successful
172                 while (i.hasNext() && !allright.get())
173                 {
174                     final int failoverIndex = i.nextIndex();
175                     final RemoteLocation server = i.next();
176                     log.debug("Trying server [{0}] at failover index i = {1}", server, failoverIndex);
177 
178                     final RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
179                     rca.setRemoteLocation(server);
180                     final RemoteCacheManager rcm = cacheFactory.getManager( rca );
181 
182                     log.debug( "RemoteCacheAttributes for failover = {0}", rca );
183 
184                     if (rcm != null)
185                     {
186                         // add a listener if there are none, need to tell rca
187                         // what number it is at
188                         final ICache<K, V> ic = rcm.getCache( rca );
189                         if ( ic.getStatus() == CacheStatus.ALIVE )
190                         {
191                             // may need to do this more gracefully
192                             log.debug( "resetting no wait" );
193                             restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
194                             rca0.setFailoverIndex(failoverIndex);
195 
196                             log.debug("setting ALLRIGHT to true");
197                             if (i.hasPrevious())
198                             {
199                                 log.debug("Moving to Primary Recovery Mode, failover index = {0}", failoverIndex);
200                             }
201                             else
202                             {
203                                 log.debug("No need to connect to failover, the primary server is back up.");
204                             }
205 
206                             allright.set(true);
207 
208                             log.info( "CONNECTED to host = [{0}]", rca::getRemoteLocation);
209                         }
210                     }
211                 }
212             }
213             // end if !allright
214             // get here if while index >0 and allright, meaning that we are
215             // connected to some backup server.
216             else
217             {
218                 log.debug( "ALLRIGHT is true " );
219                 log.info( "Failover runner is in primary recovery mode. "
220                         + "Failover index = {0} Will now try to reconnect to "
221                         + "primary server.", rca0::getFailoverIndex);
222             }
223 
224             // Exit loop if in test mode
225             if (allright.get() && !attemptRestorePrimary)
226             {
227                 break;
228             }
229 
230             boolean primaryRestoredSuccessfully = false;
231             // if we are not connected to the primary, try.
232             if (rca0.getFailoverIndex() > 0)
233             {
234                 primaryRestoredSuccessfully = restorePrimary();
235                 log.debug( "Primary recovery success state = {0}",
236                         primaryRestoredSuccessfully );
237             }
238 
239             if (!primaryRestoredSuccessfully)
240             {
241                 // Time driven mode: sleep between each round of recovery attempt.
242                 try
243                 {
244                     log.warn( "Failed to reconnect to primary server. "
245                             + "Cache failover runner is going to sleep for "
246                             + "{0} milliseconds.", idlePeriod );
247                     Thread.sleep( idlePeriod );
248                 }
249                 catch ( final InterruptedException ex )
250                 {
251                     // ignore;
252                 }
253             }
254 
255             // try to bring the listener back to the primary
256         }
257         while (rca0.getFailoverIndex() > 0 || !allright.get());
258         // continue if the primary is not restored or if things are not allright.
259 
260         if ( log.isInfoEnabled() )
261         {
262             final int failoverIndex = rca0.getFailoverIndex();
263             log.info( "Exiting failover runner. Failover index = {0}", failoverIndex);
264 
265             if ( failoverIndex <= 0 )
266             {
267                 log.info( "Failover index is <= 0, meaning we are not connected to a failover server." );
268             }
269             else
270             {
271                 log.info( "Failover index is > 0, meaning we are connected to a failover server." );
272             }
273         }
274     }
275 
276     /**
277      * Try to restore the primary server.
278      * <p>
279      * Once primary is restored the failover listener must be deregistered.
280      * <p>
281      * The primary server is the first server defines in the FailoverServers
282      * list.
283      *
284      * @return boolean value indicating whether the restoration was successful
285      */
286     private boolean restorePrimary()
287     {
288         final IRemoteCacheAttributes rca0 = getAuxiliaryCacheAttributes();
289         // try to move back to the primary
290         final RemoteLocation server = rca0.getFailovers().get(0);
291 
292         log.info( "Trying to restore connection to primary remote server "
293                 + "[{0}]", server );
294 
295         final RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
296         rca.setRemoteLocation(server);
297         final RemoteCacheManager rcm = cacheFactory.getManager( rca );
298 
299         if (rcm != null)
300         {
301             // add a listener if there are none, need to tell rca what number it
302             // is at
303             final ICache<K, V> ic = rcm.getCache( rca );
304             // by default the listener id should be 0, else it will be the
305             // listener
306             // Originally associated with the remote cache. either way is fine.
307             // We just don't want the listener id from a failover being used.
308             // If the remote server was rebooted this could be a problem if new
309             // locals were also added.
310 
311             if ( ic.getStatus() == CacheStatus.ALIVE )
312             {
313                 try
314                 {
315                     // we could have more than one listener registered right
316                     // now.
317                     // this will not result in a loop, only duplication
318                     // stop duplicate listening.
319                     if (getPrimaryServer() != null && getPrimaryServer().getStatus() == CacheStatus.ALIVE )
320                     {
321                         final int fidx = rca0.getFailoverIndex();
322 
323                         if ( fidx > 0 )
324                         {
325                             final RemoteLocation serverOld = rca0.getFailovers().get(fidx);
326 
327                             log.debug( "Failover Index = {0} the server at that "
328                                     + "index is [{1}]", fidx, serverOld );
329 
330                             if ( serverOld != null )
331                             {
332                                 // create attributes that reflect the
333                                 // previous failed over configuration.
334                                 final RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) rca0.clone();
335                                 rcaOld.setRemoteLocation(serverOld);
336                                 final RemoteCacheManager rcmOld = cacheFactory.getManager( rcaOld );
337 
338                                 if ( rcmOld != null )
339                                 {
340                                     // manager can remove by name if
341                                     // necessary
342                                     rcmOld.removeRemoteCacheListener( rcaOld );
343                                 }
344                                 log.info( "Successfully deregistered from "
345                                         + "FAILOVER remote server = {0}", serverOld );
346                             }
347                         }
348                         else if ( fidx == 0 )
349                         {
350                             // this should never happen. If there are no
351                             // failovers this shouldn't get called.
352                             if ( log.isDebugEnabled() )
353                             {
354                                 log.debug( "No need to restore primary, it is already restored." );
355                                 return true;
356                             }
357                         }
358                         else {
359                             // this should never happen
360                             log.warn( "Failover index is less than 0, this shouldn't happen" );
361                         }
362                     }
363                 }
364                 catch ( final IOException e )
365                 {
366                     // TODO, should try again, or somehow stop the listener
367                     log.error("Trouble trying to deregister old failover "
368                             + "listener prior to restoring the primary = {0}",
369                             server, e );
370                 }
371 
372                 // Restore primary
373                 // may need to do this more gracefully, letting the failover finish in the background
374                 final RemoteCacheNoWait<K, V> failoverNoWait = getPrimaryServer();
375 
376                 // swap in a new one
377                 restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
378                 rca0.setFailoverIndex( 0 );
379 
380                 final String message = "Successfully reconnected to PRIMARY "
381                         + "remote server. Substituted primary for "
382                         + "failoverNoWait [" + failoverNoWait + "]";
383                 log.info( message );
384 
385                 if (getCacheEventLogger() != null)
386                 {
387                     getCacheEventLogger().logApplicationEvent(
388                             "RemoteCacheFailoverRunner", "RestoredPrimary",
389                             message );
390                 }
391                 return true;
392             }
393         }
394 
395         // else all right
396         // if the failover index was at 0 here, we would be in a bad
397         // situation, unless there were just
398         // no failovers configured.
399         log.debug( "Primary server status in error, not connected." );
400 
401         return false;
402     }
403 }