1 package org.apache.commons.jcs.auxiliary.remote;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
19 * under the License.
20 */
21
22 import java.io.IOException;
23 import java.util.List;
24 import java.util.ListIterator;
25
26 import org.apache.commons.jcs.auxiliary.AbstractAuxiliaryCacheMonitor;
27 import org.apache.commons.jcs.auxiliary.remote.behavior.IRemoteCacheAttributes;
28 import org.apache.commons.jcs.engine.CacheStatus;
29 import org.apache.commons.jcs.engine.behavior.ICache;
30
31 /**
32 * The RemoteCacheFailoverRunner tries to establish a connection with a failover
33 * server, if any are defined. Once a failover connection is made, it will
34 * attempt to replace the failover with the primary remote server.
35 * <p>
36 * It works by switching out the RemoteCacheNoWait inside the Facade.
37 * <p>
38 * Client (i.e.) the CompositeCache has reference to a RemoteCacheNoWaitFacade.
39 * This facade is created by the RemoteCacheFactory. The factory maintains a set
40 * of managers, one for each remote server. Typically, there will only be one
41 * manager.
42 * <p>
43 * If you use multiple remote servers, you may want to set one or more as
44 * failovers. If a local cache cannot connect to the primary server, or looses
45 * its connection to the primary server, it will attempt to restore that
46 * Connection in the background. If failovers are defined, the Failover runner
47 * will try to connect to a failover until the primary is restored.
48 *
49 */
50 public class RemoteCacheFailoverRunner<K, V> extends AbstractAuxiliaryCacheMonitor
51 {
52 /** The facade returned to the composite cache. */
53 private final RemoteCacheNoWaitFacade<K, V> facade;
54
55 /** Factory instance */
56 private final RemoteCacheFactory cacheFactory;
57
58 /**
59 * Constructor for the RemoteCacheFailoverRunner object. This allows the
60 * FailoverRunner to modify the facade that the CompositeCache references.
61 *
62 * @param facade the facade the CompositeCache talks to.
63 * @param cacheFactory the cache factory instance
64 */
65 public RemoteCacheFailoverRunner( RemoteCacheNoWaitFacade<K, V> facade, RemoteCacheFactory cacheFactory )
66 {
67 super("JCS-RemoteCacheFailoverRunner");
68 this.facade = facade;
69 this.cacheFactory = cacheFactory;
70 setIdlePeriod(20000L);
71 }
72
73 /**
74 * Clean up all resources before shutdown
75 */
76 @Override
77 protected void dispose()
78 {
79 // empty
80 }
81
82 /**
83 * do actual work
84 */
85 @Override
86 protected void doWork()
87 {
88 // empty
89 }
90
91
92 /**
93 * Main processing method for the RemoteCacheFailoverRunner object.
94 * <p>
95 * If we do not have a connection with any failover server, this will try to
96 * connect one at a time. If no connection can be made, it goes to sleep for
97 * a while (20 seconds).
98 * <p>
99 * Once a connection with a failover is made, we will try to reconnect to
100 * the primary server.
101 * <p>
102 * The primary server is the first server defines in the FailoverServers
103 * list.
104 */
105 @Override
106 public void run()
107 {
108 // start the main work of connecting to a failover and then restoring
109 // the primary.
110 connectAndRestore();
111
112 if ( log.isInfoEnabled() )
113 {
114 int failoverIndex = facade.getAuxiliaryCacheAttributes().getFailoverIndex();
115 log.info( "Exiting failover runner. Failover index = " + failoverIndex);
116
117 if ( failoverIndex <= 0 )
118 {
119 log.info( "Failover index is <= 0, meaning we are not connected to a failover server." );
120 }
121 else if ( failoverIndex > 0 )
122 {
123 log.info( "Failover index is > 0, meaning we are connected to a failover server." );
124 }
125 // log if we are allright or not.
126 }
127 }
128
129 /**
130 * This is the main loop. If there are failovers defined, then this will
131 * continue until the primary is re-connected. If no failovers are defined,
132 * this will exit automatically.
133 */
134 private void connectAndRestore()
135 {
136 IRemoteCacheAttributes rca0 = facade.getAuxiliaryCacheAttributes();
137
138 do
139 {
140 log.info( "Remote cache FAILOVER RUNNING." );
141
142 // there is no active listener
143 if ( !allright.get() )
144 {
145 // Monitor each RemoteCacheManager instance one after the other.
146 // Each RemoteCacheManager corresponds to one remote connection.
147 List<RemoteLocation> failovers = rca0.getFailovers();
148 // we should probably check to see if there are any failovers,
149 // even though the caller
150 // should have already.
151
152 if ( failovers == null )
153 {
154 log.warn( "Remote is misconfigured, failovers was null." );
155 return;
156 }
157 else if ( failovers.size() == 1 )
158 {
159 // if there is only the primary, return out of this
160 log.info( "No failovers defined, exiting failover runner." );
161 return;
162 }
163
164 int fidx = rca0.getFailoverIndex();
165 log.debug( "fidx = " + fidx + " failovers.size = " + failovers.size() );
166
167 // shouldn't we see if the primary is backup?
168 // If we don't check the primary, if it gets connected in the
169 // background,
170 // we will disconnect it only to put it right back
171 ListIterator<RemoteLocation> i = failovers.listIterator(fidx); // + 1; // +1 skips the primary
172 if ( log.isDebugEnabled() )
173 {
174 log.debug( "starting at failover i = " + i.nextIndex() );
175 }
176
177 // try them one at a time until successful
178 for ( ; i.hasNext() && !allright.get();)
179 {
180 RemoteLocation server = i.next();
181 if ( log.isDebugEnabled() )
182 {
183 log.debug( "Trying server [" + server + "] at failover index i = " + i );
184 }
185
186 RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
187 rca.setRemoteLocation(server);
188 RemoteCacheManager rcm = cacheFactory.getManager( rca );
189
190 if ( log.isDebugEnabled() )
191 {
192 log.debug( "RemoteCacheAttributes for failover = " + rca.toString() );
193 }
194
195 if (rcm != null)
196 {
197 // add a listener if there are none, need to tell rca
198 // what number it is at
199 ICache<K, V> ic = rcm.getCache( rca );
200 if ( ic.getStatus() == CacheStatus.ALIVE )
201 {
202 // may need to do this more gracefully
203 log.debug( "resetting no wait" );
204 facade.restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
205 rca0.setFailoverIndex( i.nextIndex() );
206
207 if ( log.isDebugEnabled() )
208 {
209 log.debug( "setting ALLRIGHT to true" );
210 if ( i.hasPrevious() )
211 {
212 log.debug( "Moving to Primary Recovery Mode, failover index = " + i.nextIndex() );
213 }
214 else
215 {
216 log.debug( "No need to connect to failover, the primary server is back up." );
217 }
218 }
219
220 allright.set(true);
221
222 if ( log.isInfoEnabled() )
223 {
224 log.info( "CONNECTED to host = [" + rca.getRemoteLocation() + "]" );
225 }
226 }
227 }
228 }
229 }
230 // end if !allright
231 // get here if while index >0 and allright, meaning that we are
232 // connected to some backup server.
233 else
234 {
235 if ( log.isDebugEnabled() )
236 {
237 log.debug( "ALLRIGHT is true " );
238 }
239 if ( log.isInfoEnabled() )
240 {
241 log.info( "Failover runner is in primary recovery mode. Failover index = "
242 + rca0.getFailoverIndex() + "\n" + "Will now try to reconnect to primary server." );
243 }
244 }
245
246 boolean primaryRestoredSuccessfully = false;
247 // if we are not connected to the primary, try.
248 if ( rca0.getFailoverIndex() > 0 )
249 {
250 primaryRestoredSuccessfully = restorePrimary();
251 if ( log.isDebugEnabled() )
252 {
253 log.debug( "Primary recovery success state = " + primaryRestoredSuccessfully );
254 }
255 }
256
257 if ( !primaryRestoredSuccessfully )
258 {
259 // Time driven mode: sleep between each round of recovery
260 // attempt.
261 try
262 {
263 log.warn( "Failed to reconnect to primary server. Cache failover runner is going to sleep for "
264 + idlePeriod + " milliseconds." );
265 Thread.sleep( idlePeriod );
266 }
267 catch ( InterruptedException ex )
268 {
269 // ignore;
270 }
271 }
272
273 // try to bring the listener back to the primary
274 }
275 while ( rca0.getFailoverIndex() > 0 || !allright.get() );
276 // continue if the primary is not restored or if things are not allright.
277 }
278
279 /**
280 * Try to restore the primary server.
281 * <p>
282 * Once primary is restored the failover listener must be deregistered.
283 * <p>
284 * The primary server is the first server defines in the FailoverServers
285 * list.
286 *
287 * @return boolean value indicating whether the restoration was successful
288 */
289 private boolean restorePrimary()
290 {
291 IRemoteCacheAttributes rca0 = facade.getAuxiliaryCacheAttributes();
292 // try to move back to the primary
293 RemoteLocation server = rca0.getFailovers().get(0);
294
295 if ( log.isInfoEnabled() )
296 {
297 log.info( "Trying to restore connection to primary remote server [" + server + "]" );
298 }
299
300 RemoteCacheAttributes rca = (RemoteCacheAttributes) rca0.clone();
301 rca.setRemoteLocation(server);
302 RemoteCacheManager rcm = cacheFactory.getManager( rca );
303
304 if (rcm != null)
305 {
306 // add a listener if there are none, need to tell rca what number it
307 // is at
308 ICache<K, V> ic = rcm.getCache( rca );
309 // by default the listener id should be 0, else it will be the
310 // listener
311 // Originally associated with the remote cache. either way is fine.
312 // We just don't want the listener id from a failover being used.
313 // If the remote server was rebooted this could be a problem if new
314 // locals were also added.
315
316 if ( ic.getStatus() == CacheStatus.ALIVE )
317 {
318 try
319 {
320 // we could have more than one listener registered right
321 // now.
322 // this will not result in a loop, only duplication
323 // stop duplicate listening.
324 if ( facade.getPrimaryServer() != null && facade.getPrimaryServer().getStatus() == CacheStatus.ALIVE )
325 {
326 int fidx = rca0.getFailoverIndex();
327
328 if ( fidx > 0 )
329 {
330 RemoteLocation serverOld = rca0.getFailovers().get(fidx);
331
332 if ( log.isDebugEnabled() )
333 {
334 log.debug( "Failover Index = " + fidx + " the server at that index is ["
335 + serverOld + "]" );
336 }
337
338 if ( serverOld != null )
339 {
340 // create attributes that reflect the
341 // previous failed over configuration.
342 RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) rca0.clone();
343 rcaOld.setRemoteLocation(serverOld);
344 RemoteCacheManager rcmOld = cacheFactory.getManager( rcaOld );
345
346 if ( rcmOld != null )
347 {
348 // manager can remove by name if
349 // necessary
350 rcmOld.removeRemoteCacheListener( rcaOld );
351 }
352 if ( log.isInfoEnabled() )
353 {
354 log.info( "Successfully deregistered from FAILOVER remote server = "
355 + serverOld );
356 }
357 }
358 }
359 else if ( fidx == 0 )
360 {
361 // this should never happen. If there are no
362 // failovers this shouldn't get called.
363 if ( log.isDebugEnabled() )
364 {
365 log.debug( "No need to restore primary, it is already restored." );
366 return true;
367 }
368 }
369 else if ( fidx < 0 )
370 {
371 // this should never happen
372 log.warn( "Failover index is less than 0, this shouldn't happen" );
373 }
374 }
375 }
376 catch ( IOException e )
377 {
378 // TODO, should try again, or somehow stop the listener
379 log.error("Trouble trying to deregister old failover listener prior to restoring the primary = "
380 + server, e );
381 }
382
383 // Restore primary
384 // may need to do this more gracefully, letting the failover finish in the background
385 RemoteCacheNoWait<K, V> failoverNoWait = facade.getPrimaryServer();
386
387 // swap in a new one
388 facade.restorePrimaryServer((RemoteCacheNoWait<K, V>) ic);
389 rca0.setFailoverIndex( 0 );
390
391 if ( log.isInfoEnabled() )
392 {
393 String message = "Successfully reconnected to PRIMARY remote server. Substituted primary for failoverNoWait ["
394 + failoverNoWait + "]";
395 log.info( message );
396
397 if ( facade.getCacheEventLogger() != null )
398 {
399 facade.getCacheEventLogger().logApplicationEvent( "RemoteCacheFailoverRunner", "RestoredPrimary", message );
400 }
401 }
402 return true;
403 }
404 }
405
406 // else all right
407 // if the failover index was at 0 here, we would be in a bad
408 // situation, unless there were just
409 // no failovers configured.
410 if ( log.isDebugEnabled() )
411 {
412 log.debug( "Primary server status in error, not connected." );
413 }
414
415 return false;
416 }
417 }