netsplit problem using Swarm in Erlang project

Open wjqwsp opened this issue 7 years ago • 2 comments

I am an Erlang user, and the ability to resolve conflicts after a netsplit is the reason I use Swarm. However, I ran into a problem when testing the netsplit situation. I start 2 nodes, node A on server 1 and node B on server 2. First, I register a process on node A:

'Elixir.Swarm':'register_name'(monitor_master, monitor_master_sup, register, [monitor_master]).
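As far as I can tell, this is the Erlang spelling of Swarm.register_name(name, module, fun, args), so Swarm should call monitor_master_sup:register(monitor_master) on whichever node it picks and track the {ok, Pid} that comes back. The registration can be checked from the shell with something like this (a sketch, assuming whereis_name/1 behaves the same from Erlang as Swarm.whereis_name/1 does from Elixir):

%% Look up the pid Swarm has registered under the name; before the netsplit
%% this should return the same pid from either connected node.
'Elixir.Swarm':whereis_name(monitor_master).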

And the process "monitor_master" starts successfully. Then, I create the netsplit situation:

erlang:disconnect_node('node@host2')

Since node A and node B are disconnected, I now have two instances running in parallel, one on node A and one on node B. However, when I connect the two nodes again:

net_kernel:connect_node('node@host2')

The two nodes reconnect successfully, but both process instances keep running! I thought Swarm could resolve this conflict automatically and kill the process running on node B. Do I need to add some configuration to achieve this? Or have I misunderstood the Swarm docs? I am also a bit confused about the Erlang API, since the docs only show the Elixir one. This is the part of my gen_server that needs to be started by Swarm and the supervisor:

-module(monitor_master).
-author("root").

-behaviour(gen_server).

%% API
-export([start_link/1]).

%% gen_server callbacks
-export([init/1,
    handle_call/3,
    handle_cast/2,
    handle_info/2,
    terminate/2,
    code_change/3]).

-define(SERVER, ?MODULE).

-record(state, {size = 0, start = 0}).

%%%===================================================================
%%% API
%%%===================================================================

%%--------------------------------------------------------------------
%% @doc
%% Starts the server
%%
%% @end
%%--------------------------------------------------------------------
start_link(Name) ->
    gen_server:start_link({local, Name}, ?MODULE, [], []).

%%%===================================================================
%%% gen_server callbacks
%%%===================================================================

%%--------------------------------------------------------------------
%% @private
%% @doc
%% Initializes the server
%%
%% @spec init(Args) -> {ok, State} |
%%                     {ok, State, Timeout} |
%%                     ignore |
%%                     {stop, Reason}
%% @end
%%--------------------------------------------------------------------
-spec(init(Args :: term()) ->
    {ok, State :: #state{}} | {ok, State :: #state{}, timeout() | hibernate} |
    {stop, Reason :: term()} | ignore).
init([]) ->
    timer:send_interval(600000, calculate_speed),
    {ok, #state{start = get_timestamp(erlang:timestamp())}}.

%%--------------------------------------------------------------------
%% @private
%% @doc
%% Handling call messages
%%
%% @end
%%--------------------------------------------------------------------
-spec(handle_call(Request :: term(), From :: {pid(), Tag :: term()},
    State :: #state{}) ->
    {reply, Reply :: term(), NewState :: #state{}} |
    {reply, Reply :: term(), NewState :: #state{}, timeout() | hibernate} |
    {noreply, NewState :: #state{}} |
    {noreply, NewState :: #state{}, timeout() | hibernate} |
    {stop, Reason :: term(), Reply :: term(), NewState :: #state{}} |
    {stop, Reason :: term(), NewState :: #state{}}).
handle_call({swarm, begin_handoff}, _from, State) ->
    lager:info("begin handoff"),
    {reply, {resume, 5}, State};
handle_call(_Request, _From, State) ->
    {reply, ok, State}.

%%--------------------------------------------------------------------
%% @private
%% @doc
%% Handling cast messages
%%
%% @end
%%--------------------------------------------------------------------
-spec(handle_cast(Request :: term(), State :: #state{}) ->
    {noreply, NewState :: #state{}} |
    {noreply, NewState :: #state{}, timeout() | hibernate} |
    {stop, Reason :: term(), NewState :: #state{}}).
handle_cast({swarm, end_handoff, Delay}, State) ->
    lager:info("end handoff, delay:~p", [Delay]),
    {noreply, State};
handle_cast({swarm, resolve_conflict, Delay}, State) ->
    lager:info("conflict occurred, delay:~p", [Delay]),
    {noreply, State};
handle_cast(_Request, State) ->
    {noreply, State}.
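The remaining exported callbacks (handle_info/2, terminate/2, code_change/3) are left out of the excerpt above and are mostly boilerplate; a rough sketch is below. The {swarm, die} clause is my translation of the {:swarm, :die} message handled in the Elixir example from the Swarm README, which I understand is how Swarm tells an instance to shut down after a handoff or when it loses a conflict (the clause handling the calculate_speed tick from init/1, and get_timestamp/1, are omitted here):

%% Swarm sends {swarm, die} (Elixir {:swarm, :die}) to a process that should
%% shut down, e.g. the losing copy once a name conflict is resolved.
handle_info({swarm, die}, State) ->
    {stop, shutdown, State};
handle_info(_Info, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.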

And this is the supervisor:

-module(monitor_master_sup).

-behaviour(supervisor).

%% API
-export([start_link/0, register/1]).

%% Supervisor callbacks
-export([init/1]).

-define(SERVER, ?MODULE).

%%====================================================================
%% API functions
%%====================================================================

start_link() ->
    supervisor:start_link({local, ?SERVER}, ?MODULE, []).

%%====================================================================
%% Supervisor callbacks
%%====================================================================

%% Child :: {Id,StartFunc,Restart,Shutdown,Type,Modules}
init([]) ->
    RestartStrategy = simple_one_for_one,
    MaxRestarts = 0,
    MaxSecondsBetweenRestarts = 1,

    SupFlags = {RestartStrategy, MaxRestarts, MaxSecondsBetweenRestarts},

    Restart = temporary,
    Shutdown = 2000,
    Type = worker,

    MonitorMaster = {monitor_master, {monitor_master, start_link, []},
        Restart, Shutdown, Type, [monitor_master]},

    {ok, {SupFlags, [MonitorMaster]}}.

register(Name) ->
    {ok, _pid} = supervisor:start_child(?MODULE, [Name]).
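
After reconnecting the nodes, one thing worth checking on both of them (assuming registered/0 behaves like the Elixir Swarm.registered/0) is which names Swarm is still tracking:

%% List every name Swarm currently tracks, with its pid; after the netsplit
%% heals, monitor_master should appear exactly once across the cluster.
'Elixir.Swarm':registered().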

I am not sure whether I translated the Elixir API correctly, because I cannot see any logs printed when the conflict happens. How can I resolve conflicts in netsplit situations? Are there any configs I am missing? I would be grateful if anyone could give me some tips. Thank you!

wjqwsp avatar Apr 27 '18 01:04 wjqwsp

Can you post the logs?

derekkraan avatar Apr 30 '18 07:04 derekkraan

@derekkraan Thank you. I have found that when my cluster size is larger than 2, all the callbacks work well. But when the cluster size is 2, the callbacks are never called. As for the process not being killed when the netsplit heals, I have found that the registered name is actually removed, but the process itself stays alive even though other processes can no longer find it by the registered name. Maybe that's just the way Swarm works.
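
One way to see this from the shell on node B (a sketch; erlang:whereis/1 finds the local copy only because start_link/1 also registers the name locally):

%% Run on node B after the netsplit heals:
'Elixir.Swarm':whereis_name(monitor_master).             %% where the Swarm name resolves now
erlang:is_process_alive(erlang:whereis(monitor_master)). %% whether the copy started during the split is still running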

wjqwsp avatar May 05 '18 08:05 wjqwsp