001package org.apache.commons.jcs.auxiliary.remote;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import java.io.IOException;
023import java.util.List;
024import java.util.ListIterator;
025
026import org.apache.commons.jcs.auxiliary.AbstractAuxiliaryCacheMonitor;
027import org.apache.commons.jcs.auxiliary.remote.behavior.IRemoteCacheAttributes;
028import org.apache.commons.jcs.engine.CacheStatus;
029import org.apache.commons.jcs.engine.behavior.ICache;
030
031/**
032 * The RemoteCacheFailoverRunner tries to establish a connection with a failover
033 * server, if any are defined. Once a failover connection is made, it will
034 * attempt to replace the failover with the primary remote server.
035 * <p>
036 * It works by switching out the RemoteCacheNoWait inside the Facade.
037 * <p>
038 * Client (i.e.) the CompositeCache has reference to a RemoteCacheNoWaitFacade.
039 * This facade is created by the RemoteCacheFactory. The factory maintains a set
040 * of managers, one for each remote server. Typically, there will only be one
041 * manager.
042 * <p>
043 * If you use multiple remote servers, you may want to set one or more as
044 * failovers. If a local cache cannot connect to the primary server, or looses
045 * its connection to the primary server, it will attempt to restore that
046 * Connection in the background. If failovers are defined, the Failover runner
047 * will try to connect to a failover until the primary is restored.
048 *
049 */
050public class RemoteCacheFailoverRunner<K, V> extends AbstractAuxiliaryCacheMonitor
051{
052    /** The facade returned to the composite cache. */
053    private final RemoteCacheNoWaitFacade<K, V> facade;
054
055    /** Factory instance */
056    private final RemoteCacheFactory cacheFactory;
057
058    /**
059     * Constructor for the RemoteCacheFailoverRunner object. This allows the
060     * FailoverRunner to modify the facade that the CompositeCache references.
061     *
062     * @param facade the facade the CompositeCache talks to.
063     * @param cacheFactory the cache factory instance
064     */
065    public RemoteCacheFailoverRunner( RemoteCacheNoWaitFacade<K, V> facade, RemoteCacheFactory cacheFactory )
066    {
067        super("JCS-RemoteCacheFailoverRunner");
068        this.facade = facade;
069        this.cacheFactory = cacheFactory;
070        setIdlePeriod(20000L);
071    }
072
073    /**
074     * Clean up all resources before shutdown
075     */
076    @Override
077    protected void dispose()
078    {
079        // empty
080    }
081
082    /**
083     * do actual work
084     */
085    @Override
086    protected void doWork()
087    {
088        // empty
089    }
090
091
092    /**
093     * Main processing method for the RemoteCacheFailoverRunner object.
094     * <p>
095     * If we do not have a connection with any failover server, this will try to
096     * connect one at a time. If no connection can be made, it goes to sleep for
097     * a while (20 seconds).
098     * <p>
099     * Once a connection with a failover is made, we will try to reconnect to
100     * the primary server.
101     * <p>
102     * The primary server is the first server defines in the FailoverServers
103     * list.
104     */
105    @Override
106    public void run()
107    {
108        // start the main work of connecting to a failover and then restoring
109        // the primary.
110        connectAndRestore();
111
112        if ( log.isInfoEnabled() )
113        {
114            int failoverIndex = facade.getAuxiliaryCacheAttributes().getFailoverIndex();
115            log.info( "Exiting failover runner. Failover index = " + failoverIndex);
116
117            if ( failoverIndex <= 0 )
118            {
119                log.info( "Failover index is <= 0, meaning we are not connected to a failover server." );
120            }
121            else if ( failoverIndex > 0 )
122            {
123                log.info( "Failover index is > 0, meaning we are connected to a failover server." );
124            }
125            // log if we are allright or not.
126        }
127    }
128
129    /**
130     * This is the main loop. If there are failovers defined, then this will
131     * continue until the primary is re-connected. If no failovers are defined,
132     * this will exit automatically.
133     */
134    private void connectAndRestore()
135    {
136        IRemoteCacheAttributes rca0 = facade.getAuxiliaryCacheAttributes();
137
138        do
139        {
140            log.info( "Remote cache FAILOVER RUNNING." );
141
142            // there is no active listener
143            if ( !allright.get() )
144            {
145                // Monitor each RemoteCacheManager instance one after the other.
146                // Each RemoteCacheManager corresponds to one remote connection.
147                List<RemoteLocation> failovers = rca0.getFailovers();
148                // we should probably check to see if there are any failovers,
149                // even though the caller
150                // should have already.
151
152                if ( failovers == null )
153                {
154                    log.warn( "Remote is misconfigured, failovers was null." );
155                    return;
156                }
157                else if ( failovers.size() == 1 )
158                {
159                    // if there is only the primary, return out of this
160                    log.info( "No failovers defined, exiting failover runner." );
161                    return;
162                }
163
164                int fidx = rca0.getFailoverIndex();
165                log.debug( "fidx = " + fidx + " failovers.size = " + failovers.size() );
166
167                // shouldn't we see if the primary is backup?
168                // If we don't check the primary, if it gets connected in the
169                // background,
170                // we will disconnect it only to put it right back
171                ListIterator<RemoteLocation> i = failovers.listIterator(fidx); // + 1; // +1 skips the primary
172                if ( log.isDebugEnabled() )
173                {
174                    log.debug( "starting at failover i = " + i.nextIndex() );
175                }
176
177                // try them one at a time until successful
178                for ( ; i.hasNext() && !allright.get();)
179                {
180                    RemoteLocation server = i.next();
181                    if ( log.isDebugEnabled() )
182                    {
183                        log.debug( "Trying server [" + server + "] at failover index i = " + i );
184                    }
185
186                    RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
187                    rca.setRemoteLocation(server);
188                    RemoteCacheManager rcm = cacheFactory.getManager( rca );
189
190                    if ( log.isDebugEnabled() )
191                    {
192                        log.debug( "RemoteCacheAttributes for failover = " + rca.toString() );
193                    }
194
195                    if (rcm != null)
196                    {
197                        // add a listener if there are none, need to tell rca
198                        // what number it is at
199                        ICache<K, V> ic = rcm.getCache( rca );
200                        if ( ic.getStatus() == CacheStatus.ALIVE )
201                        {
202                            // may need to do this more gracefully
203                            log.debug( "resetting no wait" );
204                            facade.restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
205                            rca0.setFailoverIndex( i.nextIndex() );
206
207                            if ( log.isDebugEnabled() )
208                            {
209                                log.debug( "setting ALLRIGHT to true" );
210                                if ( i.hasPrevious() )
211                                {
212                                    log.debug( "Moving to Primary Recovery Mode, failover index = " + i.nextIndex() );
213                                }
214                                else
215                                {
216                                    log.debug( "No need to connect to failover, the primary server is back up." );
217                                }
218                            }
219
220                            allright.set(true);
221
222                            if ( log.isInfoEnabled() )
223                            {
224                                log.info( "CONNECTED to host = [" + rca.getRemoteLocation() + "]" );
225                            }
226                        }
227                    }
228                }
229            }
230            // end if !allright
231            // get here if while index >0 and allright, meaning that we are
232            // connected to some backup server.
233            else
234            {
235                if ( log.isDebugEnabled() )
236                {
237                    log.debug( "ALLRIGHT is true " );
238                }
239                if ( log.isInfoEnabled() )
240                {
241                    log.info( "Failover runner is in primary recovery mode. Failover index = "
242                        + rca0.getFailoverIndex() + "\n" + "Will now try to reconnect to primary server." );
243                }
244            }
245
246            boolean primaryRestoredSuccessfully = false;
247            // if we are not connected to the primary, try.
248            if ( rca0.getFailoverIndex() > 0 )
249            {
250                primaryRestoredSuccessfully = restorePrimary();
251                if ( log.isDebugEnabled() )
252                {
253                    log.debug( "Primary recovery success state = " + primaryRestoredSuccessfully );
254                }
255            }
256
257            if ( !primaryRestoredSuccessfully )
258            {
259                // Time driven mode: sleep between each round of recovery
260                // attempt.
261                try
262                {
263                    log.warn( "Failed to reconnect to primary server. Cache failover runner is going to sleep for "
264                        + idlePeriod + " milliseconds." );
265                    Thread.sleep( idlePeriod );
266                }
267                catch ( InterruptedException ex )
268                {
269                    // ignore;
270                }
271            }
272
273            // try to bring the listener back to the primary
274        }
275        while ( rca0.getFailoverIndex() > 0 || !allright.get() );
276        // continue if the primary is not restored or if things are not allright.
277    }
278
279    /**
280     * Try to restore the primary server.
281     * <p>
282     * Once primary is restored the failover listener must be deregistered.
283     * <p>
284     * The primary server is the first server defines in the FailoverServers
285     * list.
286     *
287     * @return boolean value indicating whether the restoration was successful
288     */
289    private boolean restorePrimary()
290    {
291        IRemoteCacheAttributes rca0 = facade.getAuxiliaryCacheAttributes();
292        // try to move back to the primary
293        RemoteLocation server = rca0.getFailovers().get(0);
294
295        if ( log.isInfoEnabled() )
296        {
297            log.info( "Trying to restore connection to primary remote server [" + server + "]" );
298        }
299
300        RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
301        rca.setRemoteLocation(server);
302        RemoteCacheManager rcm = cacheFactory.getManager( rca );
303
304        if (rcm != null)
305        {
306            // add a listener if there are none, need to tell rca what number it
307            // is at
308            ICache<K, V> ic = rcm.getCache( rca );
309            // by default the listener id should be 0, else it will be the
310            // listener
311            // Originally associated with the remote cache. either way is fine.
312            // We just don't want the listener id from a failover being used.
313            // If the remote server was rebooted this could be a problem if new
314            // locals were also added.
315
316            if ( ic.getStatus() == CacheStatus.ALIVE )
317            {
318                try
319                {
320                    // we could have more than one listener registered right
321                    // now.
322                    // this will not result in a loop, only duplication
323                    // stop duplicate listening.
324                    if ( facade.getPrimaryServer() != null && facade.getPrimaryServer().getStatus() == CacheStatus.ALIVE )
325                    {
326                        int fidx = rca0.getFailoverIndex();
327
328                        if ( fidx > 0 )
329                        {
330                            RemoteLocation serverOld = rca0.getFailovers().get(fidx);
331
332                            if ( log.isDebugEnabled() )
333                            {
334                                log.debug( "Failover Index = " + fidx + " the server at that index is ["
335                                    + serverOld + "]" );
336                            }
337
338                            if ( serverOld != null )
339                            {
340                                // create attributes that reflect the
341                                // previous failed over configuration.
342                                RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) rca0.clone();
343                                rcaOld.setRemoteLocation(serverOld);
344                                RemoteCacheManager rcmOld = cacheFactory.getManager( rcaOld );
345
346                                if ( rcmOld != null )
347                                {
348                                    // manager can remove by name if
349                                    // necessary
350                                    rcmOld.removeRemoteCacheListener( rcaOld );
351                                }
352                                if ( log.isInfoEnabled() )
353                                {
354                                    log.info( "Successfully deregistered from FAILOVER remote server = "
355                                        + serverOld );
356                                }
357                            }
358                        }
359                        else if ( fidx == 0 )
360                        {
361                            // this should never happen. If there are no
362                            // failovers this shouldn't get called.
363                            if ( log.isDebugEnabled() )
364                            {
365                                log.debug( "No need to restore primary, it is already restored." );
366                                return true;
367                            }
368                        }
369                        else if ( fidx < 0 )
370                        {
371                            // this should never happen
372                            log.warn( "Failover index is less than 0, this shouldn't happen" );
373                        }
374                    }
375                }
376                catch ( IOException e )
377                {
378                    // TODO, should try again, or somehow stop the listener
379                    log.error("Trouble trying to deregister old failover listener prior to restoring the primary = "
380                           + server, e );
381                }
382
383                // Restore primary
384                // may need to do this more gracefully, letting the failover finish in the background
385                RemoteCacheNoWait<K, V> failoverNoWait = facade.getPrimaryServer();
386
387                // swap in a new one
388                facade.restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
389                rca0.setFailoverIndex( 0 );
390
391                if ( log.isInfoEnabled() )
392                {
393                    String message = "Successfully reconnected to PRIMARY remote server.  Substituted primary for failoverNoWait ["
394                        + failoverNoWait + "]";
395                    log.info( message );
396
397                    if ( facade.getCacheEventLogger() != null )
398                    {
399                        facade.getCacheEventLogger().logApplicationEvent( "RemoteCacheFailoverRunner", "RestoredPrimary", message );
400                    }
401                }
402                return true;
403            }
404        }
405
406        // else all right
407        // if the failover index was at 0 here, we would be in a bad
408        // situation, unless there were just
409        // no failovers configured.
410        if ( log.isDebugEnabled() )
411        {
412            log.debug( "Primary server status in error, not connected." );
413        }
414
415        return false;
416    }
417}