This does seem to be a problem with ML04. IBM are currently looking into the problem for me. Meanwhile, see below for a script which cleans the /etc/utmp file and removes users who are no longer logged in. I've been running this for 2 weeks now without any problems.
Note from IBM:
Rebooting clears the utmp file and is the recommended method of correcting the results of corruption.
The following is an awk script that can be used to attempt to clean out bad entries in the /etc/utmp file. It may not clean certain types of corruption, in which case a reboot will be required to clean up the file.
WARNING: Since the utmp file is constantly being changed, there is always the possibility that an attempt at correction (other than by rebooting) may corrupt the /etc/utmp file.
#!/usr/bin/ksh
# utmp_clean.awk
# 12/12/95
#
# ksh/awk script to clean out entries in the /etc/utmp file
# that have no current matching correct process in the
# process table.
#
# This MUST be run by the root user, either from the
# command line or from the root crontab entry.
#
# How it works:
#   1. Dump /etc/utmp to text with fwtmp and capture a "ps au" listing,
#      repeating until /etc/utmp has the same checksum before and after,
#      so both snapshots describe the same state of the file.
#   2. For every utmp record whose 4th field is 7 (presumably the
#      USER_PROCESS type from <utmp.h> -- confirm against the fwtmp
#      output on this release), keep it only when ps lists the same pid
#      with the same name and tty.  All other records pass through.
#   3. Convert the result back with "fwtmp -ic" and copy it over
#      /etc/utmp, but only if the checksum is still the one from step 1.
#
# Exit status:
#   0  /etc/utmp was updated, or it was too busy and was left untouched
#   1  a prerequisite or an intermediate step failed; /etc/utmp untouched
#      (unless the final cp itself failed part-way)
#
FWTMP=/usr/sbin/acct/fwtmp
UTMP=/etc/utmp

if [ ! -s "$FWTMP" ]; then
    # accounting not installed
    print -u2 "Accounting must be installed first,fwtmp file does not exist"
    exit 1
fi

# Keep all scratch files in a private directory.  mkdir fails if the name
# already exists, so nobody can pre-create or symlink the paths that this
# root-run script is about to write to.
umask 077
WORKDIR=/tmp/utmp_clean.$$
mkdir "$WORKDIR" || {
    print -u2 "cannot create work directory $WORKDIR"
    exit 1
}
# Remove the scratch files on every exit path, including interrupts.
trap 'rm -rf "$WORKDIR"' EXIT
trap 'exit 1' HUP INT TERM

PS_OUT=$WORKDIR/ps.out
UTMP_OUT=$WORKDIR/utmp.out
UTMP_TMP=$WORKDIR/utmp.tmp
UTMP_NEW=$WORKDIR/utmp.new

# Take the two snapshots.  Loop until the file is unchanged across the
# snapshot; on a busy system, this may take a long time.
SUM=1
NEWSUM=0
while [ "$SUM" != "$NEWSUM" ]; do
    SUM=$(/usr/bin/sum "$UTMP")
    "$FWTMP" <"$UTMP" >"$UTMP_OUT"
    # Emit "pid user tty" per process.  The header row is skipped by its
    # PID column so that a user or tty containing "USER" is not lost.
    ps au | awk '$2 != "PID" {print $2,$1,$7}' >"$PS_OUT"
    NEWSUM=$(/usr/bin/sum "$UTMP")
done

# An empty dump would lead to /etc/utmp being overwritten with nothing.
if [ ! -s "$UTMP_OUT" ]; then
    print -u2 "fwtmp produced no output from $UTMP, nothing changed"
    exit 1
fi

awk -v psfile="$PS_OUT" '
BEGIN {
    # load the array: lookup[pid] = "name tty" as reported by ps
    while ((getline row < psfile) > 0) {
        split(row, col)
        lookup[col[1]] = col[2] " " col[3]
    }
    close(psfile)
}
$4 == "7" {
    # Entry needs to be checked for accuracy: only if a ps table entry
    # with the same pid exists and the name and tty match, write it.
    if (($5 in lookup) && (lookup[$5] == $1 " " $2))
        print
    next
}
{
    # Not an entry to look at, just pass it along
    print
}' "$UTMP_OUT" >"$UTMP_TMP" || {
    print -u2 "awk filter failed, $UTMP left unchanged"
    exit 1
}

# Convert back to binary; never install a failed or empty conversion.
if "$FWTMP" -ic <"$UTMP_TMP" >"$UTMP_NEW" && [ -s "$UTMP_NEW" ]; then
    :
else
    print -u2 "fwtmp -ic failed or produced an empty file, $UTMP left unchanged"
    exit 1
fi

# Only if the /etc/utmp file is still unchanged from when
# we last looked will the file be overwritten with the
# updated copy.
# WARNING WARNING WARNING
# There is a chance that this step may corrupt the
# /etc/utmp file if a process changes it after we look
# and before we can write it.
CURRENTSUM=$(/usr/bin/sum "$UTMP")
if [ "$CURRENTSUM" = "$SUM" ]; then
    if /usr/bin/cp "$UTMP_NEW" "$UTMP"; then
        print "utmp successfully updated on $(date)"
    else
        print -u2 "copy to $UTMP failed on $(date)"
        exit 1
    fi
else
    print "utmp was too busy on $(date) to update now"
    print "try again later"
fi
exit 0