001package org.apache.commons.jcs3.auxiliary.remote;
002
003import java.io.IOException;
004
005/*
006 * Licensed to the Apache Software Foundation (ASF) under one
007 * or more contributor license agreements.  See the NOTICE file
008 * distributed with this work for additional information
009 * regarding copyright ownership.  The ASF licenses this file
010 * to you under the Apache License, Version 2.0 (the
011 * "License"); you may not use this file except in compliance
012 * with the License.  You may obtain a copy of the License at
013 *
014 *   http://www.apache.org/licenses/LICENSE-2.0
015 *
016 * Unless required by applicable law or agreed to in writing,
017 * software distributed under the License is distributed on an
018 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
019 * KIND, either express or implied.  See the License for the
020 * specific language governing permissions and limitations
021 * under the License.
022 */
023
024import java.util.List;
025import java.util.ListIterator;
026import java.util.concurrent.atomic.AtomicBoolean;
027
028import org.apache.commons.jcs3.auxiliary.remote.behavior.IRemoteCacheAttributes;
029import org.apache.commons.jcs3.auxiliary.remote.server.behavior.RemoteType;
030import org.apache.commons.jcs3.engine.CacheStatus;
031import org.apache.commons.jcs3.engine.behavior.ICache;
032import org.apache.commons.jcs3.engine.behavior.IElementSerializer;
033import org.apache.commons.jcs3.engine.logging.behavior.ICacheEventLogger;
034import org.apache.commons.jcs3.log.Log;
035import org.apache.commons.jcs3.log.LogManager;
036
037/**
 * Used to provide access to multiple services under nowait protection. Factory should construct
 * NoWaitFacade to give to the composite cache out of caches it constructs from the various managers
 * to lateral services.
041 * <p>
042 * Typically, we only connect to one remote server per facade. We use a list of one
043 * RemoteCacheNoWait.
044 */
045public class RemoteCacheNoWaitFacade<K, V>
046    extends AbstractRemoteCacheNoWaitFacade<K, V>
047{
048    /** log instance */
049    private static final Log log = LogManager.getLog( RemoteCacheNoWaitFacade.class );
050
051    /** Provide factory instance to RemoteCacheFailoverRunner */
052    private final RemoteCacheFactory cacheFactory;
053
054    /** Attempt to restore primary connection (switched off for testing) */
055    protected boolean attemptRestorePrimary = true;
056
057    /** Time in ms to sleep between failover attempts */
058    private static final long idlePeriod = 20000L;
059
060    /**
061     * Constructs with the given remote cache, and fires events to any listeners.
062     * <p>
063     * @param noWaits
064     * @param rca
065     * @param cacheEventLogger
066     * @param elementSerializer
067     * @param cacheFactory
068     */
069    public RemoteCacheNoWaitFacade( final List<RemoteCacheNoWait<K,V>> noWaits,
070                                    final IRemoteCacheAttributes rca,
071                                    final ICacheEventLogger cacheEventLogger,
072                                    final IElementSerializer elementSerializer,
073                                    final RemoteCacheFactory cacheFactory)
074    {
075        super( noWaits, rca, cacheEventLogger, elementSerializer );
076        this.cacheFactory = cacheFactory;
077    }
078
079    /**
080     * Begin the failover process if this is a local cache. Clustered remote caches do not failover.
081     * <p>
082     * @param rcnw The no wait in error.
083     */
084    @Override
085    protected void failover( final RemoteCacheNoWait<K, V> rcnw )
086    {
087        log.debug( "in failover for {0}", rcnw );
088
089        if ( getAuxiliaryCacheAttributes().getRemoteType() == RemoteType.LOCAL )
090        {
091            if ( rcnw.getStatus() == CacheStatus.ERROR )
092            {
093                // start failover, primary recovery process
094                final Thread runner = new Thread(this::connectAndRestore);
095                runner.setDaemon( true );
096                runner.start();
097
098                if ( getCacheEventLogger() != null )
099                {
100                    getCacheEventLogger().logApplicationEvent( "RemoteCacheNoWaitFacade", "InitiatedFailover",
101                                                               rcnw + " was in error." );
102                }
103            }
104            else
105            {
106                log.info( "The noWait is not in error" );
107            }
108        }
109    }
110
111    /**
112     * The thread tries to establish a connection with a failover
113     * server, if any are defined. Once a failover connection is made, it will
114     * attempt to replace the failover with the primary remote server.
115     * <p>
116     * It works by switching out the RemoteCacheNoWait inside the Facade.
117     * <p>
118     * Client (i.e.) the CompositeCache has reference to a RemoteCacheNoWaitFacade.
119     * This facade is created by the RemoteCacheFactory. The factory maintains a set
120     * of managers, one for each remote server. Typically, there will only be one
121     * manager.
122     * <p>
123     * If you use multiple remote servers, you may want to set one or more as
124     * failovers. If a local cache cannot connect to the primary server, or looses
125     * its connection to the primary server, it will attempt to restore that
126     * Connection in the background. If failovers are defined, the Failover runner
127     * will try to connect to a failover until the primary is restored.
128     * If no failovers are defined, this will exit automatically.
129     *
130     * @since 3.1
131     */
132    protected void connectAndRestore()
133    {
134        final IRemoteCacheAttributes rca0 = getAuxiliaryCacheAttributes();
135        // Each RemoteCacheManager corresponds to one remote connection.
136        final List<RemoteLocation> failovers = rca0.getFailovers();
137        // we should probably check to see if there are any failovers,
138        // even though the caller should have already.
139
140        if ( failovers == null )
141        {
142            log.warn( "Remote is misconfigured, failovers was null." );
143            return;
144        }
145        if ( failovers.size() == 1 )
146        {
147            // if there is only the primary, return out of this
148            log.info( "No failovers defined, exiting failover runner." );
149            return;
150        }
151
152        final AtomicBoolean allright = new AtomicBoolean(false);
153
154        do
155        {
156            log.info( "Remote cache FAILOVER RUNNING." );
157
158            // there is no active listener
159            if ( !allright.get() )
160            {
161                // Monitor each RemoteCacheManager instance one after the other.
162                final int fidx = rca0.getFailoverIndex();
163                log.debug( "fidx = {0} failovers.size = {1}", rca0::getFailoverIndex, failovers::size);
164
165                // If we don't check the primary, if it gets connected in the
166                // background,
167                // we will disconnect it only to put it right back
168                final ListIterator<RemoteLocation> i = failovers.listIterator(fidx); // + 1; // +1 skips the primary
169                log.debug( "starting at failover i = {0}", i );
170
171                // try them one at a time until successful
172                while (i.hasNext() && !allright.get())
173                {
174                    final int failoverIndex = i.nextIndex();
175                    final RemoteLocation server = i.next();
176                    log.debug("Trying server [{0}] at failover index i = {1}", server, failoverIndex);
177
178                    final RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
179                    rca.setRemoteLocation(server);
180                    final RemoteCacheManager rcm = cacheFactory.getManager( rca );
181
182                    log.debug( "RemoteCacheAttributes for failover = {0}", rca );
183
184                    if (rcm != null)
185                    {
186                        // add a listener if there are none, need to tell rca
187                        // what number it is at
188                        final ICache<K, V> ic = rcm.getCache( rca );
189                        if ( ic.getStatus() == CacheStatus.ALIVE )
190                        {
191                            // may need to do this more gracefully
192                            log.debug( "resetting no wait" );
193                            restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
194                            rca0.setFailoverIndex(failoverIndex);
195
196                            log.debug("setting ALLRIGHT to true");
197                            if (i.hasPrevious())
198                            {
199                                log.debug("Moving to Primary Recovery Mode, failover index = {0}", failoverIndex);
200                            }
201                            else
202                            {
203                                log.debug("No need to connect to failover, the primary server is back up.");
204                            }
205
206                            allright.set(true);
207
208                            log.info( "CONNECTED to host = [{0}]", rca::getRemoteLocation);
209                        }
210                    }
211                }
212            }
213            // end if !allright
214            // get here if while index >0 and allright, meaning that we are
215            // connected to some backup server.
216            else
217            {
218                log.debug( "ALLRIGHT is true " );
219                log.info( "Failover runner is in primary recovery mode. "
220                        + "Failover index = {0} Will now try to reconnect to "
221                        + "primary server.", rca0::getFailoverIndex);
222            }
223
224            // Exit loop if in test mode
225            if (allright.get() && !attemptRestorePrimary)
226            {
227                break;
228            }
229
230            boolean primaryRestoredSuccessfully = false;
231            // if we are not connected to the primary, try.
232            if (rca0.getFailoverIndex() > 0)
233            {
234                primaryRestoredSuccessfully = restorePrimary();
235                log.debug( "Primary recovery success state = {0}",
236                        primaryRestoredSuccessfully );
237            }
238
239            if (!primaryRestoredSuccessfully)
240            {
241                // Time driven mode: sleep between each round of recovery attempt.
242                try
243                {
244                    log.warn( "Failed to reconnect to primary server. "
245                            + "Cache failover runner is going to sleep for "
246                            + "{0} milliseconds.", idlePeriod );
247                    Thread.sleep( idlePeriod );
248                }
249                catch ( final InterruptedException ex )
250                {
251                    // ignore;
252                }
253            }
254
255            // try to bring the listener back to the primary
256        }
257        while (rca0.getFailoverIndex() > 0 || !allright.get());
258        // continue if the primary is not restored or if things are not allright.
259
260        if ( log.isInfoEnabled() )
261        {
262            final int failoverIndex = rca0.getFailoverIndex();
263            log.info( "Exiting failover runner. Failover index = {0}", failoverIndex);
264
265            if ( failoverIndex <= 0 )
266            {
267                log.info( "Failover index is <= 0, meaning we are not connected to a failover server." );
268            }
269            else
270            {
271                log.info( "Failover index is > 0, meaning we are connected to a failover server." );
272            }
273        }
274    }
275
276    /**
277     * Try to restore the primary server.
278     * <p>
279     * Once primary is restored the failover listener must be deregistered.
280     * <p>
281     * The primary server is the first server defines in the FailoverServers
282     * list.
283     *
284     * @return boolean value indicating whether the restoration was successful
285     */
286    private boolean restorePrimary()
287    {
288        final IRemoteCacheAttributes rca0 = getAuxiliaryCacheAttributes();
289        // try to move back to the primary
290        final RemoteLocation server = rca0.getFailovers().get(0);
291
292        log.info( "Trying to restore connection to primary remote server "
293                + "[{0}]", server );
294
295        final RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
296        rca.setRemoteLocation(server);
297        final RemoteCacheManager rcm = cacheFactory.getManager( rca );
298
299        if (rcm != null)
300        {
301            // add a listener if there are none, need to tell rca what number it
302            // is at
303            final ICache<K, V> ic = rcm.getCache( rca );
304            // by default the listener id should be 0, else it will be the
305            // listener
306            // Originally associated with the remote cache. either way is fine.
307            // We just don't want the listener id from a failover being used.
308            // If the remote server was rebooted this could be a problem if new
309            // locals were also added.
310
311            if ( ic.getStatus() == CacheStatus.ALIVE )
312            {
313                try
314                {
315                    // we could have more than one listener registered right
316                    // now.
317                    // this will not result in a loop, only duplication
318                    // stop duplicate listening.
319                    if (getPrimaryServer() != null && getPrimaryServer().getStatus() == CacheStatus.ALIVE )
320                    {
321                        final int fidx = rca0.getFailoverIndex();
322
323                        if ( fidx > 0 )
324                        {
325                            final RemoteLocation serverOld = rca0.getFailovers().get(fidx);
326
327                            log.debug( "Failover Index = {0} the server at that "
328                                    + "index is [{1}]", fidx, serverOld );
329
330                            if ( serverOld != null )
331                            {
332                                // create attributes that reflect the
333                                // previous failed over configuration.
334                                final RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) rca0.clone();
335                                rcaOld.setRemoteLocation(serverOld);
336                                final RemoteCacheManager rcmOld = cacheFactory.getManager( rcaOld );
337
338                                if ( rcmOld != null )
339                                {
340                                    // manager can remove by name if
341                                    // necessary
342                                    rcmOld.removeRemoteCacheListener( rcaOld );
343                                }
344                                log.info( "Successfully deregistered from "
345                                        + "FAILOVER remote server = {0}", serverOld );
346                            }
347                        }
348                        else if ( fidx == 0 )
349                        {
350                            // this should never happen. If there are no
351                            // failovers this shouldn't get called.
352                            if ( log.isDebugEnabled() )
353                            {
354                                log.debug( "No need to restore primary, it is already restored." );
355                                return true;
356                            }
357                        }
358                        else {
359                            // this should never happen
360                            log.warn( "Failover index is less than 0, this shouldn't happen" );
361                        }
362                    }
363                }
364                catch ( final IOException e )
365                {
366                    // TODO, should try again, or somehow stop the listener
367                    log.error("Trouble trying to deregister old failover "
368                            + "listener prior to restoring the primary = {0}",
369                            server, e );
370                }
371
372                // Restore primary
373                // may need to do this more gracefully, letting the failover finish in the background
374                final RemoteCacheNoWait<K, V> failoverNoWait = getPrimaryServer();
375
376                // swap in a new one
377                restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
378                rca0.setFailoverIndex( 0 );
379
380                final String message = "Successfully reconnected to PRIMARY "
381                        + "remote server. Substituted primary for "
382                        + "failoverNoWait [" + failoverNoWait + "]";
383                log.info( message );
384
385                if (getCacheEventLogger() != null)
386                {
387                    getCacheEventLogger().logApplicationEvent(
388                            "RemoteCacheFailoverRunner", "RestoredPrimary",
389                            message );
390                }
391                return true;
392            }
393        }
394
395        // else all right
396        // if the failover index was at 0 here, we would be in a bad
397        // situation, unless there were just
398        // no failovers configured.
399        log.debug( "Primary server status in error, not connected." );
400
401        return false;
402    }
403}