[Bug 10035] New: rsync hangs in solaris

classic Classic list List threaded Threaded
8 messages Options
Reply | Threaded
Open this post in threaded view
|

[Bug 10035] New: rsync hangs in solaris

samba-bugs
https://bugzilla.samba.org/show_bug.cgi?id=10035

           Summary: rsync hangs in solaris
           Product: rsync
           Version: 3.0.9
          Platform: All
        OS/Version: All
            Status: NEW
          Severity: normal
          Priority: P5
         Component: core
        AssignedTo: [hidden email]
        ReportedBy: [hidden email]
         QAContact: [hidden email]


Using Solaris SunOS 5.10 in two machines we notice from time to time rsync
hanging. Below are the results from truss on two pids from one client rsync and
the results on the server side from 4 different pids as we rsync from two
different clients. Please let me know of anything else you would like me to try
on my end. All servers are running the same rsync version:
$ /opt/csw/bin/rsync --version
rsync  version 3.0.9  protocol version 30
Copyright (C) 1996-2011 by Andrew Tridgell, Wayne Davison, and others.
Web site: http://rsync.samba.org/
Capabilities:
    64-bit files, 64-bit inums, 32-bit timestamps, 64-bit long ints,
    socketpairs, hardlinks, symlinks, IPv6, batchfiles, inplace,
    append, ACLs, no xattrs, iconv, no symtimes


A typical client interaction:
10530:  pollsys(0x08047850, 2, 0x00000000, 0x00000000)  = 1
10530:  read(4, " BF9CEC9AD931BD5E4A78FE8".., 8192)     = 128
10530:  pollsys(0x08047850, 3, 0x00000000, 0x00000000)  = 1
10530:  write(7, " P\0\0\n r s y n c   e r".., 84)      = 84
10530:  pollsys(0x08047850, 2, 0x00000000, 0x00000000)  = 1
10530:  read(4, "93B8 #BA 784F9CF 918 .B6".., 8192)     = 128
10530:  pollsys(0x08047850, 3, 0x00000000, 0x00000000)  = 1
10530:  write(7, " M\0\0\t [ s e n d e r ]".., 81)      = 81
10530:  pollsys(0x08047850, 2, 0x00000000, 0x00000000)  = 1
10530:  read(4, "\n98C7 lC2D2FF k jED8AEE".., 8192)     = 128
10530:  close(7)                                        = 0
10530:  close(6)                                        = 0
10530:  close(8)                                        = 0
10530:  pollsys(0x08047850, 1, 0x00000000, 0x00000000)  = 1
10530:  write(4, "9094 t PF4958E9A > 41C ^".., 32)      = 32
10530:  ioctl(0, TCGETA, 0x08047988)                    Err#22 EINVAL
10530:  fcntl(0, F_GETFL)                               = 130
10530:  fcntl(0, F_SETFL, FWRITE)                       = 0
10530:  ioctl(1, TCGETA, 0x08047988)                    Err#22 EINVAL
10530:  fcntl(1, F_GETFL)                               = 130
10530:  fcntl(1, F_SETFL, FWRITE)                       = 0
10530:  ioctl(2, TCGETA, 0x08047988)                    Err#22 EINVAL
10530:  fcntl(2, F_GETFL)                               = 2
10530:  shutdown(4, SHUT_RDWR, SOV_DEFAULT)             = 0
10530:  close(4)                                        = 0
10530:  close(5)                                        = 0
10530:  _exit(30)
10529:      Received signal #18, SIGCLD, in write() [caught]
10529:        siginfo: SIGCLD CLD_EXITED pid=10530 status=0x001E
10529:  write(1, " r e c v _ f i l e _ n a".., 42)      Err#4 EINTR
10529:  lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF
[0x0000FFFF]
10529:  waitid(P_ALL, 0, 0x08045220, WEXITED|WTRAPPED|WNOHANG) = 0
10529:  waitid(P_ALL, 0, 0x08045220, WEXITED|WTRAPPED|WNOHANG) Err#10 ECHILD
10529:  setcontext(0x08045110)
10529:  pollsys(0x08045D80, 1, 0x08045E10, 0x00000000)  = 1
10529:  read(7, " (\0\0\t", 4)                          = 4
10529:  time()                                          = 1374162335
10529:  pollsys(0x08045D80, 1, 0x08045E10, 0x00000000)  = 1
10529:  read(7, " [ s e n d e r ]   m a k".., 40)       = 40
10529:  time()                                          = 1374162335
10529:  write(1, 0x080C1FE4, 40)        (sleeping...)



A typical server interaction(multiple pids):
1. After several:
24874:  pollsys(0x08045470, 1, 0x080454D0, 0x00000000)  = 0
24874:  time()                                          = 1374162034
24874:  pollsys(0x08045470, 1, 0x080454D0, 0x00000000) (sleeping...)

at timeout we get:
24830:  write(1, " M\0\0\t [ s e n d e r ]".., 81)      = 81
24830:  time()                                          = 1374162319
24830:  pollsys(0x080453E0, 0, 0x08045430, 0x00000000)  = 0
24830:  _exit(30)
24827:  waitid(P_ALL, 0, 0x08047A20,
WEXITED|WTRAPPED|WSTOPPED|WCONTINUED|WNOWAIT) = 0
24827:  times(0x08047A10)                               = 1387016957
24827:  waitid(P_ALL, 0, 0x08047990,
WEXITED|WTRAPPED|WSTOPPED|WCONTINUED|WNOHANG) = 0
24827:  times(0x08047A20)                               = 1387016957
24827:  times(0x08047A10)                               = 1387016957
24827:  waitid(P_ALL, 0, 0x08047990,
WEXITED|WTRAPPED|WSTOPPED|WCONTINUED|WNOHANG) Err#10 ECHILD
24827:  lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF
[0x0000FFFF]
24827:  lwp_sigmask(SIG_SETMASK, 0x00000002, 0x00000000) = 0xFFBFFEFF
[0x0000FFFF]
24827:  lwp_sigmask(SIG_SETMASK, 0x00000002, 0x00000000) = 0xFFBFFEFF
[0x0000FFFF]
24827:  lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF
[0x0000FFFF]
24827:  _exit(30)
24874:  pollsys(0x08045470, 1, 0x080454D0, 0x00000000)  = 0
24874:  time()                                          = 1374162334
24874:  sigaction(SIGUSR1, 0x08045460, 0x00000000)      = 0
24874:  sigaction(SIGUSR2, 0x08045460, 0x00000000)      = 0
24874:  pollsys(0x080418F0, 1, 0x08041950, 0x00000000)  = 1
24874:  write(1, " P\0\0\n r s y n c   e r".., 84)      = 84
24874:  time()                                          = 1374162334
24874:  pollsys(0x08041930, 1, 0x08041990, 0x00000000)  = 1
24874:  write(1, " M\0\0\t [ s e n d e r ]".., 81)      = 81
24874:  time()                                          = 1374162334
24874:  pollsys(0x080453E0, 0, 0x08045430, 0x00000000)  = 0
24874:  _exit(30)
24871:  waitid(P_ALL, 0, 0x08047A20,
WEXITED|WTRAPPED|WSTOPPED|WCONTINUED|WNOWAIT) = 0
24871:  times(0x08047A10)                               = 1387018547
24871:  waitid(P_ALL, 0, 0x08047990,
WEXITED|WTRAPPED|WSTOPPED|WCONTINUED|WNOHANG) = 0
24871:  times(0x08047A20)                               = 1387018547
24871:  times(0x08047A10)                               = 1387018547
24871:  waitid(P_ALL, 0, 0x08047990,
WEXITED|WTRAPPED|WSTOPPED|WCONTINUED|WNOHANG) Err#10 ECHILD
24871:  lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF
[0x0000FFFF]
24871:  lwp_sigmask(SIG_SETMASK, 0x00000002, 0x00000000) = 0xFFBFFEFF
[0x0000FFFF]
24871:  lwp_sigmask(SIG_SETMASK, 0x00000002, 0x00000000) = 0xFFBFFEFF
[0x0000FFFF]
24871:  lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF
[0x0000FFFF]
24871:  _exit(30)

Thanks!
- Nestor

--
Configure bugmail: https://bugzilla.samba.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the QA contact for the bug.
--
Please use reply-all for most replies to avoid omitting the mailing list.
To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync
Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html
Reply | Threaded
Open this post in threaded view
|

[Bug 10035] rsync hangs in solaris

samba-bugs
https://bugzilla.samba.org/show_bug.cgi?id=10035

--- Comment #1 from Nestor Urquiza <[hidden email]> 2013-07-19 14:31:17 UTC ---
More feedback on this. I run rsync from a wrapper script. See below what I get
running rsync with -vvvv option:

[sender] make_file(filename.rsl,*,2)
recv_file_name(filename.rsl)
rsync error: timeout in data send/receive (code 30) at rsync.c(549)
[Receiver=3.0.9]
... mywrapperscript: line 9:   140 Broken Pipe             su - myuser -c
'...myscript'

--
Configure bugmail: https://bugzilla.samba.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the QA contact for the bug.
--
Please use reply-all for most replies to avoid omitting the mailing list.
To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync
Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html
Reply | Threaded
Open this post in threaded view
|

[Bug 10035] rsync hangs in solaris

samba-bugs
In reply to this post by samba-bugs
https://bugzilla.samba.org/show_bug.cgi?id=10035

--- Comment #2 from Nestor Urquiza <[hidden email]> 2013-07-19 21:29:37 UTC ---
More feedback. This can be replicated I believe using the -vvvv option better.
I have found the script fails way less when using just -v. So:
rsync -e ssh -avvvz -stats --delete-after --exclude=core --timeout=1800
fails easier than:
rsync -e ssh -avz -stats --delete-after --exclude=core --timeout=1800

--
Configure bugmail: https://bugzilla.samba.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the QA contact for the bug.
--
Please use reply-all for most replies to avoid omitting the mailing list.
To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync
Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html
Reply | Threaded
Open this post in threaded view
|

[Bug 10035] rsync hangs in solaris

samba-bugs
In reply to this post by samba-bugs
https://bugzilla.samba.org/show_bug.cgi?id=10035

--- Comment #3 from Nestor Urquiza <[hidden email]> 2013-07-19 23:33:11 UTC ---
Now that I removed the -vvv switch rsync hangs only at 3AM. All Solaris 10
machines run around that time something. Here is the relevant information from
root cron:

10 3 * * * /usr/sbin/logadm
15 3 * * 0 /usr/lib/fs/nfs/nfsfind
30 3 * * * [ -x /usr/lib/gss/gsscred_clean ] && /usr/lib/gss/gsscred_clean

Might this explain why rsync hangs?

--
Configure bugmail: https://bugzilla.samba.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the QA contact for the bug.
--
Please use reply-all for most replies to avoid omitting the mailing list.
To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync
Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html
Reply | Threaded
Open this post in threaded view
|

[Bug 10035] rsync hangs in solaris

samba-bugs
In reply to this post by samba-bugs
https://bugzilla.samba.org/show_bug.cgi?id=10035

--- Comment #4 from Nestor Urquiza <[hidden email]> 2013-07-24 15:02:19 UTC ---
Yesterday the script actually hanged 6 times from 11PM to 5AM so it is now
discarded this is anything related to specific commands running at any specific
time. The fact that I am running it now from two different clients at the same
time and it hands exacrly at the same time from both is telling me that the
issue is most likely on the server side.

Is this enough information so far to find anything to be fixed on rsync running
on the server side?

My guess is that a hiccup in the network could be causing this but I am running
out of tools.

Someone recommended in a private email to run the below around the time the
issue happens but I cron it and it did not do any difference:
svcadm restart svc:/system/name-service-cache:default

--
Configure bugmail: https://bugzilla.samba.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the QA contact for the bug.
--
Please use reply-all for most replies to avoid omitting the mailing list.
To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync
Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html
Reply | Threaded
Open this post in threaded view
|

[Bug 10035] rsync hangs in solaris

samba-bugs
In reply to this post by samba-bugs
https://bugzilla.samba.org/show_bug.cgi?id=10035

--- Comment #5 from Nestor Urquiza <[hidden email]> 2013-07-24 15:19:38 UTC ---
Here are the errors for those instances when it failed last night BTW:

1. Note that I use a 30 minutes timeout. Here are two errors where the timeout
was reached
1.1 filexyz: connection unexpectedly closed (518581065 bytes received so far)
[receiver]
rsync error: error in rsync protocol data stream (code 12) at io.c(605)
[receiver=3.0.9]
rsync: connection unexpectedly closed (56153 bytes received so far) [generator]
rsync error: timeout in data send/receive (code 30) at io.c(605)
[generator=3.0.9]
1.2 deleting fileabc
rsync error: timeout in data send/receive (code 30) at io.c(137) [sender=3.0.9]
rsync: connection unexpectedly closed (197119174 bytes received so far)
[receiver]
rsync error: error in rsync protocol data stream (code 12) at io.c(605)
[receiver=3.0.9]

2. In another 4 cases it just hanged while syncing specific files and no
timeout was reached.

--
Configure bugmail: https://bugzilla.samba.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the QA contact for the bug.
--
Please use reply-all for most replies to avoid omitting the mailing list.
To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync
Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html
Reply | Threaded
Open this post in threaded view
|

[Bug 10035] rsync hangs in solaris

samba-bugs
In reply to this post by samba-bugs
https://bugzilla.samba.org/show_bug.cgi?id=10035

--- Comment #6 from Nestor Urquiza <[hidden email]> 2013-07-25 17:23:06 UTC ---
I set my script which uses rsync to run every hour. Last night I logged a snoop
result (tcp sniffer) but also a netstat result every minute. It failed two
ties:

00:15 RSYNC hags for the first time.
rsync error: error in rsync protocol data stream (code 12) at io.c(605)
[receiver=3.0.9]
rsync: connection unexpectedly closed (56207 bytes received so far) [generator]
rsync error: timeout in data send/receive (code 30) at io.c(605)
[generator=3.0.9]

At this point from netstat we can see the same connection for as long as one
complete hour:
192.168.5.124.22     client.46480  49640      0 49640      0 ESTABLISHED

It is not until the next hour when that connection disappears and the new
connection from the new rsync instance shows up. Why the rsync timeout does not
work I do not know but it should have closed that connection 30 minutes after
it started:
192.168.5.124.22     client.47122  46720   5319 49640      0 ESTABLISHED

From snoop we see first a lot of PUSH:
0:17:44.19207 client -> server       TCP D=22 S=45088 Ack=2803790001
Seq=3254408116 Len=0 Win=45260
0:17:44.21747       server -> client TCP D=45088 S=22 Push Ack=3254408212
Seq=2803900013 Len=1208 Win=49640
0:17:49.14072       server -> client TCP D=45088 S=22 Ack=3254418532
Seq=2819000581 Len=1460 Win=49640
0:17:52.58905       server -> client TCP D=45088 S=22 Ack=3254425348
Seq=2829000925 Len=1460 Win=49640
0:17:52.58923 client -> server       TCP D=22 S=45088 Ack=2829000925
Seq=3254425348 Len=0 Win=46720
0:17:55.51377 client -> server       TCP D=22 S=45088 Ack=2837490005
Seq=3254431156 Len=0 Win=49640
0:17:55.51401       server -> client TCP D=45088 S=22 Ack=3254431156
Seq=2837490005 Len=1460 Win=49640
0:17:55.65287       server -> client TCP D=45088 S=22 Ack=3254431396
Seq=2837900009 Len=1460 Win=49640
0:17:56.03237       server -> client TCP D=45088 S=22 Ack=3254432164
Seq=2839000357 Len=1460 Win=49640
0:17:56.43671       server -> client TCP D=45088 S=22 Ack=3254432980
Seq=2840190009 Len=1460 Win=49640
0:17:56.43685 client -> server       TCP D=22 S=45088 Ack=2840190009
Seq=3254432980 Len=0 Win=48180
0:17:59.34139       server -> client TCP D=45088 S=22 Push Ack=3254438932
Seq=2848900045 Len=1208 Win=49640
0:17:59.37478 client -> server       TCP D=22 S=45088 Ack=2849000373
Seq=3254438980 Len=0 Win=49640
0:17:59.38230       server -> client TCP D=45088 S=22 Ack=3254438980
Seq=2849000373 Len=1460 Win=49640
0:17:59.94439       server -> client TCP D=45088 S=22 Ack=3254440132
Seq=2850690005 Len=1460 Win=49640
0:17:59.94445 client -> server       TCP D=22 S=45088 Ack=2850690005
Seq=3254440132 Len=0 Win=49640
0:18:0.37861 client -> server       TCP D=22 S=45088 Ack=2851990005
Seq=3254441044 Len=0 Win=49640
0:18:0.38616       server -> client TCP D=45088 S=22 Ack=3254441044
Seq=2851990005 Len=1460 Win=49640
0:18:1.01774       server -> client TCP D=45088 S=22 Ack=3254442340
Seq=2853890009 Len=1460 Win=49640
0:18:1.01798 client -> server       TCP D=22 S=45088 Ack=2853890009
Seq=3254442340 Len=0 Win=48180
0:18:2.72033       server -> client TCP D=45088 S=22 Push Ack=3254445796
Seq=2859000285 Len=1224 Win=49640
0:18:2.72057 client -> server       TCP D=22 S=45088 Ack=2859000285
Seq=3254445796 Len=0 Win=46720
0:18:3.01461       server -> client TCP D=45088 S=22 Ack=3254446420
Seq=2859900029 Len=1460 Win=49640
0:18:3.01479 client -> server       TCP D=22 S=45088 Ack=2859900029
Seq=3254446420 Len=0 Win=46720
0:18:6.63878       server -> client TCP D=45088 S=22 Ack=3254454772
Seq=2872190005 Len=1460 Win=49640
0:18:7.93095       server -> client TCP D=45088 S=22 Push Ack=3254457364
Seq=2875900093 Len=1224 Win=49640
0:18:8.88357       server -> client TCP D=45088 S=22 Ack=3254459476
Seq=2879000797 Len=1460 Win=49640

Then we see the server sending Syn but the client replies RST aborting such
connection attempt:
0:31:26.82366       server -> client TCP D=9000 S=65013 Syn Seq=2618246633
Len=0 Win=49640 Options=<mss 1460,nop,wscale 0,nop,nop,sackOK>
0:31:26.82390 client -> server       TCP D=65013 S=9000 Rst Ack=2618246634
Win=0
0:32:27.31389       server -> client TCP D=9000 S=65027 Syn Seq=2634946872
Len=0 Win=49640 Options=<mss 1460,nop,wscale 0,nop,nop,sackOK>
0:32:27.31414 client -> server       TCP D=65027 S=9000 Rst Ack=2634946873
Win=0
0:33:27.44815       server -> client TCP D=9000 S=65040 Syn Seq=2651282487
Len=0 Win=49640 Options=<mss 1460,nop,wscale 0,nop,nop,sackOK>
0:33:27.44839 client -> server       TCP D=65040 S=9000 Rst Ack=2651282488
Win=0
0:34:27.60344       server -> client TCP D=9000 S=65053 Syn Seq=2667734624
Len=0 Win=49640 Options=<mss 1460,nop,wscale 0,nop,nop,sackOK>
0:34:27.60369 client -> server       TCP D=65053 S=9000 Rst Ack=2667734625
Win=0
0:35:27.74829       server -> client TCP D=9000 S=65066 Syn Seq=2684174414
Len=0 Win=49640 Options=<mss 1460,nop,wscale 0,nop,nop,sackOK>
0:35:27.74849 client -> server       TCP D=65066 S=9000 Rst Ack=2684174415
Win=0
0:36:27.89816       server -> client TCP D=9000 S=65079 Syn Seq=2700669927
Len=0 Win=49640 Options=<mss 1460,nop,wscale 0,nop,nop,sackOK>
0:36:27.89839 client -> server       TCP D=65079 S=9000 Rst Ack=2700669928
Win=0
0:37:28.03938       server -> client TCP D=9000 S=65092 Syn Seq=2717056785
Len=0 Win=49640 Options=<mss 1460,nop,wscale 0,nop,nop,sackOK>
0:37:28.03960 client -> server       TCP D=65092 S=9000 Rst Ack=2717056786
Win=0
0:38:28.17940       server -> client TCP D=9000 S=65105 Syn Seq=2733472760
Len=0 Win=49640 Options=<mss 1460,nop,wscale 0,nop,nop,sackOK>
0:38:28.17964 client -> server       TCP D=65105 S=9000 Rst Ack=2733472761
Win=0
0:39:28.31650       server -> client TCP D=9000 S=65118 Syn Seq=2749919254
Len=0 Win=49640 Options=<mss 1460,nop,wscale 0,nop,nop,sackOK>
0:39:28.31672 client -> server



04:15 RSYNC issue. Nothing in netstat so I assume there was never a connection
established:
deleting ckp.9000.20130724.0/segment.107/segment.10727.portfolio.VELOCITY.19
deleting ckp.9000.20130724.0/segment.107/segment.10726.portfolio.GOLDFINCH.33
rsync error: timeout in data send/receive (code 30) at io.c(137) [sender=3.0.9]
rsync: connection unexpectedly closed (206402787 bytes received so far)
[receiver]
rsync error: error in rsync protocol data stream (code 12) at io.c(605)
[receiver=3.0.9]

Snoop shows exactly what I wrote above

--
Configure bugmail: https://bugzilla.samba.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the QA contact for the bug.
--
Please use reply-all for most replies to avoid omitting the mailing list.
To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync
Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html
Reply | Threaded
Open this post in threaded view
|

[Bug 10035] rsync hangs in solaris

Samba - rsync mailing list
In reply to this post by samba-bugs