001    package org.apache.jcs.auxiliary.remote;
002    
003    /*
004     * Licensed to the Apache Software Foundation (ASF) under one
005     * or more contributor license agreements.  See the NOTICE file
006     * distributed with this work for additional information
007     * regarding copyright ownership.  The ASF licenses this file
008     * to you under the Apache License, Version 2.0 (the
009     * "License"); you may not use this file except in compliance
010     * with the License.  You may obtain a copy of the License at
011     *
012     *   http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing,
015     * software distributed under the License is distributed on an
016     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017     * KIND, either express or implied.  See the License for the
018     * specific language governing permissions and limitations
019     * under the License.
020     */
021    
022    import java.io.IOException;
023    import java.io.Serializable;
024    
025    import org.apache.commons.logging.Log;
026    import org.apache.commons.logging.LogFactory;
027    import org.apache.jcs.engine.CacheStatus;
028    import org.apache.jcs.engine.behavior.ICache;
029    import org.apache.jcs.engine.behavior.ICompositeCacheManager;
030    import org.apache.jcs.engine.behavior.IElementSerializer;
031    import org.apache.jcs.engine.logging.behavior.ICacheEventLogger;
032    
033    /**
034     * The RemoteCacheFailoverRunner tries to establish a connection with a failover
035     * server, if any are defined. Once a failover connection is made, it will
036     * attempt to replace the failover with the primary remote server.
037     * <p>
038     * It works by switching out the RemoteCacheNoWait inside the Facade.
039     * <p>
040     * The client (i.e., the CompositeCache) holds a reference to a RemoteCacheNoWaitFacade.
041     * This facade is created by the RemoteCacheFactory. The factory maintains a set
042     * of managers, one for each remote server. Typically, there will only be one
043     * manager.
044     * <p>
045     * If you use multiple remote servers, you may want to set one or more as
046     * failovers. If a local cache cannot connect to the primary server, or loses
047     * its connection to the primary server, it will attempt to restore that
048     * Connection in the background. If failovers are defined, the Failover runner
049     * will try to connect to a failover until the primary is restored.
050     *
051     */
052    public class RemoteCacheFailoverRunner<K extends Serializable, V extends Serializable>
053        implements Runnable
054    {
055        /** The logger */
056        private final static Log log = LogFactory.getLog( RemoteCacheFailoverRunner.class );
057    
058        /** The facade returned to the composite cache. */
059        private final RemoteCacheNoWaitFacade<K, V> facade;
060    
061        /** How long to wait between reconnect attempts. */
062        private static long idlePeriod = 20 * 1000;
063    
064        /** Have we reconnected. */
065        private boolean alright = true;
066    
067        /** The cache manager */
068        private final ICompositeCacheManager cacheMgr;
069    
070        /** The event logger. */
071        private final ICacheEventLogger cacheEventLogger;
072    
073        /** The serializer. */
074        private final IElementSerializer elementSerializer;
075    
076        /**
077         * Constructor for the RemoteCacheFailoverRunner object. This allows the
078         * FailoverRunner to modify the facade that the CompositeCache references.
079         *
080         * @param facade
081         *            the facade the CompositeCache talks to.
082         * @param cacheMgr
083         * @param cacheEventLogger
084         * @param elementSerializer
085         */
086        public RemoteCacheFailoverRunner( RemoteCacheNoWaitFacade<K, V> facade, ICompositeCacheManager cacheMgr,
087                                          ICacheEventLogger cacheEventLogger, IElementSerializer elementSerializer )
088        {
089            this.facade = facade;
090            this.cacheMgr = cacheMgr;
091            this.cacheEventLogger = cacheEventLogger;
092            this.elementSerializer = elementSerializer;
093        }
094    
095        /**
096         * Notifies the cache monitor that an error occurred, and kicks off the
097         * error recovery process.
098         */
099        public void notifyError()
100        {
101            bad();
102            synchronized ( this )
103            {
104                notify();
105            }
106        }
107    
108        /**
109         * Main processing method for the RemoteCacheFailoverRunner object.
110         * <p>
111         * If we do not have a connection with any failover server, this will try to
112         * connect one at a time. If no connection can be made, it goes to sleep for
113         * a while (20 seconds).
114         * <p>
115         * Once a connection with a failover is made, we will try to reconnect to
116         * the primary server.
117         * <p>
118         * The primary server is the first server defines in the FailoverServers
119         * list.
120         */
121        public void run()
122        {
123            // start the main work of connecting to a failover and then restoring
124            // the primary.
125            connectAndRestore();
126    
127            if ( log.isInfoEnabled() )
128            {
129                log.info( "Exiting failover runner. Failover index = " + facade.remoteCacheAttributes.getFailoverIndex() );
130                if ( facade.remoteCacheAttributes.getFailoverIndex() <= 0 )
131                {
132                    log.info( "Failover index is <= 0, meaning we are not " + "connected to a failover server." );
133                }
134                else if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
135                {
136                    log.info( "Failover index is > 0, meaning we are " + "connected to a failover server." );
137                }
138                // log if we are alright or not.
139            }
140        }
141    
142        /**
143         * This is the main loop. If there are failovers defined, then this will
144         * continue until the primary is re-connected. If no failovers are defined,
145         * this will exit automatically.
146         */
147        @SuppressWarnings("unchecked") // No generic arrays in java
148        private void connectAndRestore()
149        {
150            do
151            {
152                log.info( "Remote cache FAILOVER RUNNING." );
153    
154                // there is no active listener
155                if ( !alright )
156                {
157                    // Monitor each RemoteCacheManager instance one after the other.
158                    // Each RemoteCacheManager corresponds to one remote connection.
159                    String[] failovers = facade.remoteCacheAttributes.getFailovers();
160                    // we should probably check to see if there are any failovers,
161                    // even though the caller
162                    // should have already.
163    
164                    if ( failovers == null )
165                    {
166                        log.warn( "Remote is misconfigured, failovers was null." );
167                        return;
168                    }
169                    else if ( failovers.length == 1 )
170                    {
171                        // if there is only the primary, return out of this
172                        if ( log.isInfoEnabled() )
173                        {
174                            log.info( "No failovers defined, exiting failover runner." );
175                            return;
176                        }
177                    }
178    
179                    int fidx = facade.remoteCacheAttributes.getFailoverIndex();
180                    log.debug( "fidx = " + fidx + " failovers.length = " + failovers.length );
181    
182                    // shouldn't we see if the primary is backup?
183                    // If we don't check the primary, if it gets connected in the
184                    // background,
185                    // we will disconnect it only to put it right back
186                    int i = fidx; // + 1; // +1 skips the primary
187                    if ( log.isDebugEnabled() )
188                    {
189                        log.debug( "stating at failover i = " + i );
190                    }
191    
192                    // try them one at a time until successful
193                    for ( ; i < failovers.length && !alright; i++ )
194                    {
195                        String server = failovers[i];
196                        if ( log.isDebugEnabled() )
197                        {
198                            log.debug( "Trying server [" + server + "] at failover index i = " + i );
199                        }
200    
201                        RemoteCacheAttributes rca = null;
202                        try
203                        {
204                            rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
205                            rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
206                            rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
207                            RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr, cacheEventLogger, elementSerializer );
208    
209                            if ( log.isDebugEnabled() )
210                            {
211                                log.debug( "RemoteCacheAttributes for failover = " + rca.toString() );
212                            }
213    
214                            // add a listener if there are none, need to tell rca
215                            // what number it is at
216                            ICache<K, V> ic = rcm.getCache( rca.getCacheName() );
217                            if ( ic != null )
218                            {
219                                if ( ic.getStatus() == CacheStatus.ALIVE )
220                                {
221                                    // may need to do this more gracefully
222                                    log.debug( "resetting no wait" );
223                                    facade.noWaits = new RemoteCacheNoWait[1];
224                                    facade.noWaits[0] = (RemoteCacheNoWait<K, V>) ic;
225                                    facade.remoteCacheAttributes.setFailoverIndex( i );
226    
227                                    synchronized ( this )
228                                    {
229                                        if ( log.isDebugEnabled() )
230                                        {
231                                            log.debug( "setting ALRIGHT to true" );
232                                            if ( i > 0 )
233                                            {
234                                                log.debug( "Moving to Primary Recovery Mode, failover index = " + i );
235                                            }
236                                            else
237                                            {
238                                                if ( log.isInfoEnabled() )
239                                                {
240                                                    String message = "No need to connect to failover, the primary server is back up.";
241                                                    log.info( message );
242                                                }
243                                            }
244                                        }
245    
246                                        alright = true;
247    
248                                        if ( log.isInfoEnabled() )
249                                        {
250                                            log.info( "CONNECTED to host = [" + rca.getRemoteHost() + "] port = ["
251                                                + rca.getRemotePort() + "]" );
252                                        }
253                                    }
254                                }
255                            }
256                            else
257                            {
258                                log.info( "noWait is null" );
259                            }
260                        }
261                        catch ( Exception ex )
262                        {
263                            bad();
264                            // Problem encountered in fixing the caches managed by a
265                            // RemoteCacheManager instance.
266                            // Soldier on to the next RemoteCacheManager instance.
267                            String remoteHost = (rca == null) ? "null" : rca.getRemoteHost();
268                            int remotePort = (rca == null) ? 0 : rca.getRemotePort();
269                            if ( i == 0 )
270                            {
271                                log.warn( "FAILED to connect, as expected, to primary" + remoteHost + ":"
272                                    + remotePort, ex );
273                            }
274                            else
275                            {
276                                log.error( "FAILED to connect to failover [" + remoteHost + ":"
277                                    + remotePort + "]", ex );
278                            }
279                        }
280                    }
281                }
282                // end if !alright
283                // get here if while index >0 and alright, meaning that we are
284                // connected to some backup server.
285                else
286                {
287                    if ( log.isDebugEnabled() )
288                    {
289                        log.debug( "ALRIGHT is true " );
290                    }
291                    if ( log.isInfoEnabled() )
292                    {
293                        log.info( "Failover runner is in primary recovery mode. Failover index = "
294                            + facade.remoteCacheAttributes.getFailoverIndex() + "\n" + "Will now try to reconnect to primary server." );
295                    }
296                }
297    
298                boolean primaryRestoredSuccessfully = false;
299                // if we are not connected to the primary, try.
300                if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
301                {
302                    primaryRestoredSuccessfully = restorePrimary();
303                    if ( log.isDebugEnabled() )
304                    {
305                        log.debug( "Primary recovery success state = " + primaryRestoredSuccessfully );
306                    }
307                }
308    
309                if ( !primaryRestoredSuccessfully )
310                {
311                    // Time driven mode: sleep between each round of recovery
312                    // attempt.
313                    try
314                    {
315                        log.warn( "Failed to reconnect to primary server. Cache failover runner is going to sleep for "
316                            + idlePeriod + " milliseconds." );
317                        Thread.sleep( idlePeriod );
318                    }
319                    catch ( InterruptedException ex )
320                    {
321                        // ignore;
322                    }
323                }
324    
325                // try to bring the listener back to the primary
326            }
327            while ( facade.remoteCacheAttributes.getFailoverIndex() > 0 || !alright );
328            // continue if the primary is not restored or if things are not alright.
329    
330        }
331    
332        /**
333         * Try to restore the primary server.
334         * <p>
335         * Once primary is restored the failover listener must be deregistered.
336         * <p>
337         * The primary server is the first server defines in the FailoverServers
338         * list.
339         *
340         * @return boolean value indicating whether the restoration was successful
341         */
342        @SuppressWarnings("unchecked") // No generic arrays in java
343        private boolean restorePrimary()
344        {
345            // try to move back to the primary
346            String[] failovers = facade.remoteCacheAttributes.getFailovers();
347            String server = failovers[0];
348    
349            if ( log.isInfoEnabled() )
350            {
351                log.info( "Trying to restore connection to primary remote server [" + server + "]" );
352            }
353    
354            try
355            {
356                RemoteCacheAttributes rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
357                rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
358                rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
359                RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr, cacheEventLogger, elementSerializer );
360    
361                // add a listener if there are none, need to tell rca what number it
362                // is at
363                ICache<K, V> ic = rcm.getCache( rca.getCacheName() );
364                // by default the listener id should be 0, else it will be the
365                // listener
366                // Originally associated with the remote cache. either way is fine.
367                // We just don't want the listener id from a failover being used.
368                // If the remote server was rebooted this could be a problem if new
369                // locals were also added.
370    
371                if ( ic != null )
372                {
373                    if ( ic.getStatus() == CacheStatus.ALIVE )
374                    {
375                        try
376                        {
377                            // we could have more than one listener registered right
378                            // now.
379                            // this will not result in a loop, only duplication
380                            // stop duplicate listening.
381                            if ( facade.noWaits[0] != null && facade.noWaits[0].getStatus() == CacheStatus.ALIVE )
382                            {
383                                int fidx = facade.remoteCacheAttributes.getFailoverIndex();
384    
385                                if ( fidx > 0 )
386                                {
387                                    String serverOld = failovers[fidx];
388    
389                                    if ( log.isDebugEnabled() )
390                                    {
391                                        log.debug( "Failover Index = " + fidx + " the server at that index is ["
392                                            + serverOld + "]" );
393                                    }
394    
395                                    if ( serverOld != null )
396                                    {
397                                        // create attributes that reflect the
398                                        // previous failed over configuration.
399                                        RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
400                                        rcaOld.setRemoteHost( serverOld.substring( 0, serverOld.indexOf( ":" ) ) );
401                                        rcaOld.setRemotePort( Integer.parseInt( serverOld.substring( serverOld
402                                            .indexOf( ":" ) + 1 ) ) );
403                                        RemoteCacheManager rcmOld = RemoteCacheManager.getInstance( rcaOld, cacheMgr, cacheEventLogger, elementSerializer );
404    
405                                        if ( rcmOld != null )
406                                        {
407                                            // manager can remove by name if
408                                            // necessary
409                                            rcmOld.removeRemoteCacheListener( rcaOld );
410                                        }
411                                        if ( log.isInfoEnabled() )
412                                        {
413                                            log.info( "Successfully deregistered from FAILOVER remote server = "
414                                                + serverOld );
415                                        }
416                                    }
417                                }
418                                else if ( fidx == 0 )
419                                {
420                                    // this should never happen. If there are no
421                                    // failovers this shouldn't get called.
422                                    if ( log.isDebugEnabled() )
423                                    {
424                                        log.debug( "No need to restore primary, it is already restored." );
425                                        return true;
426                                    }
427                                }
428                                else if ( fidx < 0 )
429                                {
430                                    // this should never happen
431                                    log.warn( "Failover index is less than 0, this shouldn't happen" );
432                                }
433                            }
434                        }
435                        catch ( IOException e )
436                        {
437                            // TODO, should try again, or somehow stop the listener
438                            log.error(
439                                       "Trouble trying to deregister old failover listener prior to restoring the primary = "
440                                           + server, e );
441                        }
442    
443                        // Restore primary
444                        // may need to do this more gracefully, letting the failover finish in the background
445                        RemoteCacheNoWait<K, V> failoverNoWait = facade.noWaits[0];
446    
447                        // swap in a new one
448                        facade.noWaits = new RemoteCacheNoWait[1];
449                        facade.noWaits[0] = (RemoteCacheNoWait<K, V>) ic;
450                        facade.remoteCacheAttributes.setFailoverIndex( 0 );
451    
452                        if ( log.isInfoEnabled() )
453                        {
454                            String message = "Successfully reconnected to PRIMARY remote server.  Substituted primary for failoverNoWait [" + failoverNoWait + "]";
455                            log.info( message );
456    
457                            if ( facade.getCacheEventLogger() != null )
458                            {
459                                facade.getCacheEventLogger().logApplicationEvent( "RemoteCacheFailoverRunner", "RestoredPrimary",
460                                                                                  message );
461                            }
462                        }
463                        return true;
464                    }
465    
466                    // else all right
467                    // if the failover index was at 0 here, we would be in a bad
468                    // situation, unless there were just
469                    // no failovers configured.
470                    if ( log.isDebugEnabled() )
471                    {
472                        log.debug( "Primary server status in error, not connected." );
473                    }
474                }
475                else
476                {
477                    if ( log.isDebugEnabled() )
478                    {
479                        log.debug( "Primary server is null, not connected." );
480                    }
481                }
482            }
483            catch ( NumberFormatException ex )
484            {
485                log.error( ex );
486            }
487            return false;
488        }
489    
490        /**
491         * Sets the "alright" flag to false in a critical section. This flag
492         * indicates whether or not we are connected to any server at all. If we are
493         * connected to a secondary server, then alright will be true, but we will
494         * continue to try to restore the connection with the primary server.
495         * <p>
496         * The primary server is the first server defines in the FailoverServers
497         * list.
498         */
499        private void bad()
500        {
501            if ( alright )
502            {
503                synchronized ( this )
504                {
505                    alright = false;
506                }
507            }
508        }
509    }