Group Selection Page HELP DOC Controls Group Data Analysis Group ITDM WP76 DET Vacuum XRO EEE WP78 WP74 FXE SCS SPB MID HED SQS SXP Sample Environment Photon Commissioning Team Operation PSPO XO TS
General electronics MicroTCA EEE Electronics Lab EEE Rack Room
  MicroTCA Logbook  Not logged in ELOG logo
Message ID: 59     Entry time: 03 Dec 2013, 10:39
Author: Frank Babies 
Type: Software Changes 
Category: utcaX 
Subject: base s/w and configuration changed on all utca's 

Puppet Setup in a XFEL environment:

Basics: -     Change BIOS Settings to boot via PXE (now USB)
-    Install Ubuntu 12.04.04-server-amd64
-    Setup Lang: EN, Keyb. EN (US)
-    /etc/fstab "/ option = discard,noatime,errors=remount-ro"

-    Network: 192.168.81.xx/24 gw. 192.168.81.16
-    DNS 131.169.40.200 131.169.194.200
-    Partition: sda1 / 86GB, Partition2: sda5 / 8GB swap, option = discard,noatime
-    Enable sshd
-    Disable ipv6

net.ipv6.conf.all.disable_ipv6=1
net.ipv6.conf.default.disable_ipv6=1
net.ipv6.conf.lo.disable_ipv6=1

-    Deinstall irqbalance
-    change swappiness from 60 to 10 (echo "vm.swappiness=10" >> /etc/sysctl.conf)
-    User:Passwd – utcaadm:xxxxx
-     %exfl_jet            ALL=(ALL:ALL) ALL  >>  /etc/sudoers
-    nagios  ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/

-     AddressFamily inet   >>  /etc/ssh/sshd_config

 

Blacklist Modules:

 -    lp
-    ppdev
-    parport_pc
-    parport
-    psmouse
-    soundcore
-    snd
-    snd_page_alloc
-    snd_timer
-    snd_pcm
-    snd_hwdep
-    snd_hda_codec
-    snd_hda_intel

 


After reboot, set up the package sources configuration:

             Change the entry in the /etc/apt/sources.list:
-    deb http://doocspkgs.desy.de/pub/doocs precise main
-    deb-src http://doocspkgs.desy.de/pub/doocs precise main
-    # DESY Ubuntu Repository
-    deb  http://nims.desy.de/ubuntu precise main restricted universe multiverse
-    deb  http://nims.desy.de/ubuntu precise-updates main restricted universe multiverse
-    deb   http://nims.desy.de/ubuntu precise-security main restricted universe multiverse
-    deb-src  http://nims.desy.de/ubuntu precise main restricted universe multiverse
-    deb-src  http://nims.desy.de/ubuntu precise-updates main restricted universe multiverse
-    deb-src  http://nims.desy.de/ubuntu precise-security main restricted universe multiverse


Install other packages:
-    ntp, rsyslog, pciedev-dkms, upciedev-dkms, doocs-dirs-min, x1timer-dkms, doocs-x2timer-server, doocs-pcie-tools, build-essential, dkms, module-assistant, locate, postfix, nedit, emacs, [spd-adq-pci-dkms*.deb, adqupdater_0.13437_amd64.deb,libadq0_0.13437_amd64.deb]
-   nagios-nrpe-server nagios-plugins, autofs, subversion, libnss3-dev, pkg-config, smartmontools, hddtemp, sysstat, postfix, python-qt4, hwinfo, mc, ipmitool
-   preload, ethtool, nfs-common, openssh-server, openssh-client, krb5-user, libpam-krb5, acpi, lm-sensors, rrdtool, perl, gmetad_3.6.0-1ubuntu2_amd64.deb, ganglia-monitor-python_3.6.0-1ubuntu2_all.deb, ganglia-monitor_3.6.0-1ubuntu2_amd64.deb, libganglia1_3.6.0-1ubuntu2_amd64.deb, libconfuse0

 
Configure the packages:

HOSTS
-127.0.0.1       localhost
#-127.0.1.1       exflutca3
#-192.168.81.106  exflutca3.desy.de       exflutca3
#-192.168.81.107  exflutca3-mch.desy.de   exflutca3-mch
-192.168.81.200  exflutcadev.desy.de     exflutcadev
# The following lines are desirable for IPv6 capable hosts
-#::1     ip6-localhost ip6-loopback
-#fe00::0 ip6-localnet
-#ff00::0 ip6-mcastprefix
-#ff02::1 ip6-allnodes
-#ff02::2 ip6-allrouters

NTP
-    Configuration from  "/etc/ntp.conf" change:
-    server ntp.desy.de
-    server ntp1.desy.de
-    server ntp2.desy.de
-    server ntp3.desy.de
-    fudge 127.127.1.0 stratum 13
-    restrict 127.0.0.1
-    restrict ::1
     STOP NTP -> ntpdate -> start ntp: "/usr/sbin/ntpd -p /var/run/ntpd.pid -g -u 106:113"

nagios:

-   Copy attached file nrpe.cfg to /etc/nagios

-   Copy rest of attached files to /usr/lib/nagios/plugins

-   Make them executable

-   run visudo and add that line:

-   nagios  ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/

 

-   service nagios-nrpe-server restart

change setup in check_temp.sh:

# Warning threshold
thresh_warn=80
# Critical threshold
thresh_crit=90
# Hardware to monitor
sensor=Core

!!! Make sure that the right values are set for the HDD (SSD) in /etc/nagios/nrpe.cfg

command[check_hddtemp]=sudo /usr/lib/nagios/plugins/check_hddtemp.sh /dev/sda 45 60


autofs:
-     mkdir /data /devhome
-     chmod 777 /data /devhome
-    put the next line in to "/etc/auto.master"
-     /-    /etc/auto.data
-    /-   /etc/auto.devhome

-    put the next line into "/etc/auto.devhome"

-    /devhome    -rw,soft,nfsvers=3  192.168.81.200:/devhome


-    put the next line in to "/etc/auto.data"
-    /data   -rw,soft,nfsvers=3   131.169.247.61:/data

-    and the last is:
-    change the timeout to 60 seconds if you want in "/etc/default/autofs"
-    restart the autofs "/etc/init.d/autofs restart"


Syslog-ng
-    Configuration from “/etc/syslog-ng/syslog-ng.conf” change
-    Put in: udp();
 

rsyslog

- *.* @192.168.81.200 >> /etc/rsyslog.d/50-default.conf

on crates:

-   destination d_exflutcadev { udp("192.168.81.200" port(514)); };
-   log { source(s_src); filter(f_messages); destination (d_exflutcadev); };

Grub
-    /etc/default/grub
-    #GRUB_CMDLINE_LINUX_DEFAULT="pciehp.pciehp_force=1 pciehp.pciehp_debug=1"
  -    GRUB_CMDLINE_LINUX="pciehp.pciehp_force=1 pciehp.pciehp_debug=1 pcie_ports=native console=tty0 console=ttyS0,115200 rootwait rootdelay=90"
-      GRUB_TERMINAL="console"
-      GRUB_SERIAL_COMMAND="serial --unit=0"


---------
-    commands:
-    update-grub  update-grub2
-    grub-install /dev/sda
----------

PROXY

put in the lines:

http_proxy="http://exflwgs06.desy.de:3128"
https_proxy="http://exflwgs06.desy.de:3128"
ftp_proxy="http://exflwgs06.desy.de:3128"
 

in /etc/environment

 
Modules
-    /etc/modules
-    insert the modules:
-    pciedev
-    upciedev

-    x1timer

-    spd_adq_pci


--------------
-    command: depmod -a

NEW:  /etc/init/tty0.conf
# tty0 - getty
#
# This service maintains a getty on tty0 from the point the system is
# started until it is shut down again.

start on stopped rc RUNLEVEL=[2345] and (
            not-container or
            container CONTAINER=lxc or
            container CONTAINER=lxc-libvirt)

stop on runlevel [!2345]

respawn
exec /sbin/getty -8 115200 tty0

NEW: /etc/init/ttyS0.conf
# ttyS0 - getty
#
# This service maintains a getty on ttyS0 from the point the system is
# started until it is shut down again.

start on stopped rc RUNLEVEL=[2345] and (
            not-container or
            container CONTAINER=lxc or
            container CONTAINER=lxc-libvirt)

stop on runlevel [!2345]

respawn
exec /sbin/getty -8 115200 ttyS0

/etc/smartd.conf:

/dev/sda -a -o on -S on -s (S/../.././03|L/../../6/03) -m root -M exec /usr/share/smartmontools/smartd-runner

to /etc/smartd.conf and started the smartd server.

and enable (uncomment) the line:  start_smartd=yes

in: /etc/default/smartmontools

 

/etc/hddtemp:

Put in the Line:

echo '"Samsung SSD 840 EVO 120G B" 190 C "Samsung SSD 840 EVO 120GB"' | sudo tee -a /etc/hddtemp.db

echo '"Samsung SSD 840 PRO Serise" 190 C "Samsung SSD 840 PRO Serise"' | sudo tee -a /etc/hddtemp.db

 

===============================================
Home
-    mkdir /data
-      command: tune2fs -e remount-ro /dev/sda1
 

Group's added:
-  3555 exfel
-  5478 exfl_jet

users added:
- 23081 exfel babies
- 21502 exfel ballakk
- 21370 exfel baskaran
- 8323  exfel coppola
- 19134 exfel emotuk
- 2512  exfel esenov
- 19446 exfel fernands
- 19499 exfel gessler
- 21126 exfel mdonato
- 23421 exfel sotoudin
- 23972 exfel utcaadm
- 20145 exfel abeckman


Networkcard driver “e1000e”

-    unpack the driver with tar and gzip, then run "make" and "make install".

Install scripts:
-    /usr/local/bin/myri-irq-bind.sh

Configure scripts:
 
-    myri-irq-bind.sh:
-    "crontab -e" insert the line "@reboot sleep 120 && /usr/local/bin/myri-irq-bind.sh eth0 8"
-    "crontab -e" insert the line "@reboot sleep 120 && /usr/local/bin/x2timer-task-bind.sh"
-    "crontab -e" insert the line "@reboot sleep 120 && /usr/local/bin/eth_push.sh"
-    "crontab -e" insert the line "2 0 * * 7 /sbin/fstrim -v /"
-    "chmod 755 /usr/local/bin/myri-irq-bind.sh"
-    "chmod 755 /usr/local/bin/x2timer-task-bind.sh"
-    "chmod 755 /usr/local/bin/eth_push.sh"
 

x2timer-task-bind.sh

#!/bin/bash
# Run "taskset -p 04" on the process whose PID is stored in the
# x2timer_server PID file.
pid_file=/export/doocs/server/x2timer_server/x2timer_server.PID
# The substitution is left unquoted on purpose so that the file contents are
# word-split exactly as with the former backtick form.
taskset -p 04 $(cat "$pid_file")

In the configfile "/etc/init/portmap.conf" I change the option from "-w" to "-i -w" on all crates, now the x2timer comes up.

 

eth_push.sh

#!/bin/bash
# Call "ethtool -G" on eth0 with rx and tx both set to 4096.
iface=eth0
ring_entries=4096
/sbin/ethtool -G "$iface" rx "$ring_entries" tx "$ring_entries"

  

Attachment 1: myri-irq-bind.sh  1 kB  Uploaded 04 Dec 2013, 09:57  | Hide | Hide all | Show all
#!/bin/bash
# Writes an affinity mask to /proc/irq/<irq>/smp_affinity for the interrupt(s)
# that /proc/interrupts lists for a network interface.
#
# Arguments:
#   $1  interface name to look up in /proc/interrupts (required)
#   $2  value reported as "CPU" in the log line and also written, formatted
#       with %x, as the affinity mask
# Note: the usage text below still names the script "msixbind.sh".
#set -x

# No arguments at all: print usage and fail.
if [ $# -eq 0 ]; then
   echo "usage: msixbind.sh INTERFACE [CPU#]"
   exit 1;
fi

eth=$1
mask=$2

echo "Binding interface $eth"
# Refuse to continue while an irqbalance process exists. The "if" below tests
# the exit status of the pgrep assignment, so nothing may be inserted between
# the two commands. The bare "exit" returns the status of the last echo.
pid=`pgrep irqbalance`
   if [ $? -eq 0 ];
   then
       echo "irqbalance is running! Pid = $pid"
       echo "it will undo anything done by this script"
       echo "Please kill it and re-run this script"
       exit
   fi

# "done" is never set to 1, so the loop below only ends through one of the
# "exit" statements. "start" is not referenced again in this script.
done=0
i=0
slice=0
start=0
# Number of /proc/interrupts lines that contain the interface name.
num_slices=`grep "${eth}" /proc/interrupts | wc -l`
while [ $done != 1 ]
do
# one of the following, depending on which version of the driver is installed
   # First try the "<iface>:slice-<n>" naming for the current slice.
   irq_data=`grep "${eth}:slice-${slice}" /proc/interrupts`

   if [ $? != 0 ];
   then
       # No such slice entry. After the first iteration this means we are done.
       if [ $i != 0 ];
       then
           exit
       fi
       # First iteration only: fall back to any line mentioning the interface.
       irq_data=`grep "${eth}" /proc/interrupts`
       if [ $? != 0 ];
       then
           exit
       fi
   fi
   # The unquoted echo collapses irq_data onto a single line, so awk's $1 is
   # the first field of the first matched line only; sed strips the colon(s).
   irq=`echo $irq_data |  awk '{print $1 ; }' | sed -e 's/://g'`
   file="/proc/irq/${irq}/smp_affinity"
   printf "Binding slice %2d to CPU %2d: writing mask 0x%08x to $file\n" $slice $mask $mask
   # Write the mask as a bare hexadecimal number (no 0x prefix, no newline).
   printf "%x" $mask > $file
   i=`expr $i + 1`
   slice=`expr $slice + 1`
   # Stop once as many iterations ran as there were matching lines.
   if [ $slice -eq $num_slices ];
   then
       exit
   fi
done
Attachment 2: nrpe.cfg  7 kB  Uploaded 15 Dec 2014, 13:57  | Show | Hide all | Show all
Attachment 3: check_temp.sh  7 kB  Uploaded 15 Dec 2014, 13:57  | Hide | Hide all | Show all
#!/bin/bash

################################################################################
#                                                                              #
#  Copyright (C) 2011 Jack-Benny Persson <jake@cyberinfo.se>                   #
#                                                                              #
#   This program is free software; you can redistribute it and/or modify       #
#   it under the terms of the GNU General Public License as published by       #
#   the Free Software Foundation; either version 2 of the License, or          #
#   (at your option) any later version.                                        #
#                                                                              #
#   This program is distributed in the hope that it will be useful,            #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#   GNU General Public License for more details.                               #
#                                                                              #
#   You should have received a copy of the GNU General Public License          #
#   along with this program; if not, write to the Free Software                #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  #
#                                                                              #
################################################################################

###############################################################################
#                                                                             #	
# Nagios plugin to monitor CPU and M/B temperature with sensors.              #
# Written in Bash (and uses sed & awk).                                       #
# Latest version of check_temp can be found at the below URL:                 #
# https://github.com/jackbenny/check_temp                                     #
#                                                                             #
# If you are having problems getting it to work, check the instructions in    #
# the README first. It walks you though install lm-sensors and getting it to  #
# display sensor data.                                                        #
#                                                                             #
###############################################################################

# Script identification, printed by print_version and print_help
VERSION='Version 0.8'
AUTHOR='(c) 2011 Jack-Benny Persson (jack-benny@cyberinfo.se)'

# Path of the sensors program this plugin runs
SENSORPROG='/usr/bin/sensors'

# Nagios plugin exit codes
STATE_OK=0 STATE_WARNING=1 STATE_CRITICAL=2 STATE_UNKNOWN=3

# Extended globbing, used by the +([0-9]) integer patterns further down
shopt -s extglob

#### Functions ####

# Print the script name and version, preceded by two blank lines.
# Globals: VERSION (read)
# Outputs: "<script> - <version>" on stdout
print_version()
{
    # $0 and $VERSION are passed as data, not spliced into the format string,
    # so a '%' or '\' in the script path or version cannot be misinterpreted
    # as a printf directive.
    printf '\n\n%s - %s\n' "$0" "$VERSION"
}

# Print the version banner, the author line and the option summary.
# Globals: AUTHOR (read)
# Calls:   print_version
# Outputs: help text on stdout
print_help()
{
    print_version
    # AUTHOR is data, never a format string.
    printf '%s\n' "$AUTHOR"
    printf 'Monitor temperature with the use of sensors\n'
    # Quoted delimiter: the help text is literal, nothing in it is expanded.
    # The stated default matches the "sensor=Core" default set in MAIN.
    /bin/cat <<'EOT'

Options:
-h
   Print detailed help screen
-V
   Print version information
-v
   Verbose output

--sensor WORD
   Set what to monitor, for example CPU or MB (or M/B). Check sensors for the
   correct word. Default is Core.
-w INTEGER
   Exit with WARNING status if above INTEGER degres
-c INTEGER
   Exit with CRITICAL status if above INTEGER degres
EOT
}


###### MAIN ########

# Default thresholds and sensor name; each can be overridden on the command
# line (-w, -c, --sensor).
# Warning threshold
thresh_warn=60
# Critical threshold
thresh_crit=80
# Hardware to monitor
sensor=Core

# See if we have sensors program installed and can execute it.
# The exit code must be STATE_UNKNOWN: the former misspelled variable expanded
# to nothing, so the plugin exited with the status of printf (0 = OK).
if [[ ! -x "$SENSORPROG" ]]; then
    printf "\nIt appears you don't have sensors installed in %s\n" "$SENSORPROG"
    exit $STATE_UNKNOWN
fi

# Parse command line options.
# Arguments: the script's positional parameters
# Globals:   verbosity, thresh_warn, thresh_crit, sensor (written);
#            STATE_OK, STATE_UNKNOWN (read)
# Exits:     STATE_OK after printing help or version, STATE_UNKNOWN on an
#            unknown option, a missing option argument or a non-integer
#            threshold.
# Option names come from the user, so they are always passed to printf as
# data ('%s') and never spliced into the format string.
parse_args()
{
    while [[ -n "$1" ]]; do
        case "$1" in

            -h | --help | -\?)
                print_help
                exit $STATE_OK
                ;;

            -V | --version)
                print_version
                exit $STATE_OK
                ;;

            -v | --verbose)
                verbosity=$(( verbosity + 1 ))
                shift
                ;;

            -w | --warning)
                if [[ -z "$2" ]]; then
                    # Threshold not provided
                    printf '\nOption %s requires an argument' "$1"
                    print_help
                    exit $STATE_UNKNOWN
                elif [[ "$2" != +([0-9]) ]]; then
                    # Threshold is not an integer
                    printf '\nThreshold must be an integer'
                    print_help
                    exit $STATE_UNKNOWN
                fi
                thresh_warn=$2
                shift 2
                ;;

            -c | --critical)
                if [[ -z "$2" ]]; then
                    # Threshold not provided
                    printf "\nOption '%s' requires an argument" "$1"
                    print_help
                    exit $STATE_UNKNOWN
                elif [[ "$2" != +([0-9]) ]]; then
                    # Threshold is not an integer
                    printf '\nThreshold must be an integer'
                    print_help
                    exit $STATE_UNKNOWN
                fi
                thresh_crit=$2
                shift 2
                ;;

            --sensor)
                if [[ -z "$2" ]]; then
                    printf '\nOption %s requires an argument' "$1"
                    print_help
                    exit $STATE_UNKNOWN
                fi
                sensor=$2
                shift 2
                ;;

            *)
                printf "\nInvalid option '%s'" "$1"
                print_help
                exit $STATE_UNKNOWN
                ;;
        esac
    done
}

parse_args "$@"


# A sensor name is mandatory: without one, print the help text and leave
# with the UNKNOWN state.
[[ -n "$sensor" ]] || {
    printf "\nNo sensor specified"
    print_help
    exit $STATE_UNKNOWN
}


# Print the integer part of the temperature on the first line of the sensors
# output that matches $sensor.
# Globals: SENSORPROG, sensor (read)
# Outputs: the leading digits that follow the first '+' on that line; empty
#          when the line has no '+' or no digits follow it.
# The former fixed-width "cut -c1-2" truncated three-digit readings
# (105 became 10) and kept the decimal point of single-digit readings
# (9.0 became "9."), so both were evaluated wrongly further down.
read_sensor_temp()
{
    local reading
    reading=$("${SENSORPROG}" | grep -- "$sensor" | head -n1 | cut -d+ -f2)
    # Strip everything from the first non-digit character onwards.
    printf '%s\n' "${reading%%[!0-9]*}"
}

#Get the temperature
TEMP=$(read_sensor_temp)


# Both thresholds must be present ...
if ! [[ -n "$thresh_warn" && -n "$thresh_crit" ]]; then
    printf "\nThreshold not set"
    print_help
    exit $STATE_UNKNOWN
fi

# ... and the warning threshold must not lie above the critical one.
if [[ "$thresh_warn" -gt "$thresh_crit" ]]; then
    printf "\nWarning temperature should be lower than critical"
    print_help
    exit $STATE_UNKNOWN
fi


# With at least one -v: dump the effective settings, then the unfiltered
# output of the sensors program.
if (( verbosity > 0 )); then
    # Note: the "Warning threshold" line deliberately ends in a blank.
    printf '%s\n' \
        "Debugging information:" \
        "  Warning threshold: $thresh_warn " \
        "  Critical threshold: $thresh_crit" \
        "  Verbosity level: $verbosity" \
        "  Current $sensor temperature: $TEMP"
    printf "\n  Temperature lines directly from sensors:\n"
    ${SENSORPROG}
    printf "\n\n"
fi

# Get performance data for Nagios "Performance Data" field: the first line of
# the sensors output that matches $sensor, appended after the '|'.
PERFDATA=$("${SENSORPROG}" | grep "$sensor" | head -n1)


# And finally check the temperature against our thresholds.
# A reading strictly above a threshold triggers the corresponding state.
if [[ "$TEMP" != +([0-9]) ]]; then
    # Temperature not found for that sensor. The sensor name may come from
    # the command line, so it is passed to printf as data, not as format.
    printf 'No data found for that sensor (%s)\n' "$sensor"
    exit $STATE_UNKNOWN

elif [[ "$TEMP" -gt "$thresh_crit" ]]; then
    # Temperature is above critical threshold
    echo "$sensor CRITICAL - Temperature is $TEMP | $PERFDATA"
    exit $STATE_CRITICAL

elif [[ "$TEMP" -gt "$thresh_warn" ]]; then
    # Temperature is above warning threshold
    echo "$sensor WARNING - Temperature is $TEMP | $PERFDATA"
    exit $STATE_WARNING

else
    # Temperature is ok
    echo "$sensor OK - Temperature is $TEMP | $PERFDATA"
    exit $STATE_OK
fi
# Not reached: every branch above exits. Kept as a defensive UNKNOWN.
exit $STATE_UNKNOWN
Attachment 4: check_hddtemp.sh  2 kB  Uploaded 15 Dec 2014, 13:58  | Hide | Hide all | Show all
#!/bin/bash
#
# USAGE:
# ./check_hddtemp.sh <device> <warn> <crit>
# Nagios script to get the temperatue of HDD from hddtemp
#
# You may have to let nagios run this script as root
# This is how the sudoers file looks in my debian system:
# nagios  ALL=(root) NOPASSWD:/usr/lib/nagios/plugins/check_hddtemp.sh
#
# Version 1.0

# Nagios plugin exit codes
OK=0 WARNING=1 CRITICAL=2 UNKNOWN=3

# Print the one-line invocation synopsis on stdout.
function usage()
{
    printf '%s\n' "Usage: ./check_hddtemp.sh <device> <warn> <crit>"
}

# Exit with UNKNOWN unless whoami reports "root".
# Globals: UNKNOWN (read)
# The user name is captured and compared quoted: with the former unquoted
# substitution an empty whoami result made the test itself fail, which was
# treated as "is root" and let the script continue.
function check_root()
{
    local current_user
    current_user=$(whoami)
    if [[ "$current_user" != root ]]; then
        echo "UNKNOWN: please make sure script is running as root"
        exit $UNKNOWN
    fi
}
# Exit unless exactly three arguments were passed.
# Arguments: the arguments to count
# Globals:   UNKNOWN (read)
# A wrong invocation is reported as UNKNOWN. It formerly exited with OK, so
# Nagios showed a misconfigured check as healthy.
function check_arg()
{
    # make sure you supplied all 3 arguments
    if [ $# -ne 3 ]; then
        usage
        exit $UNKNOWN
    fi
}
# Exit with UNKNOWN unless $DEVICE is a block special file.
# Globals: DEVICE, UNKNOWN (read)
# $DEVICE is quoted: unquoted, an empty value reduced the test to "[ ! -b ]"
# (false, so the check passed) and a value containing blanks made the test
# error out, which also let the check pass.
function check_device()
{
    # make sure device is a special block
    if [ ! -b "$DEVICE" ]; then
        echo "UNKNOWN: $DEVICE is not a block special file"
        exit $UNKNOWN
    fi
}
# Exit with UNKNOWN unless WARN and CRIT are integers and WARN < CRIT.
# Globals: WARN, CRIT, UNKNOWN (read)
# Non-integer thresholds are rejected explicitly. Formerly they made the
# numeric test error out, which counted as "not greater or equal" and let
# the invalid values through.
function check_warn_vs_crit()
{
    local value
    for value in "$WARN" "$CRIT"; do
        if ! [[ "$value" =~ ^[-+]?[0-9]+$ ]]; then
            echo "UNKNOWN: WARN and CRIT values must be integers"
            exit $UNKNOWN
        fi
    done

    # make sure CRIT is larger than WARN
    if [ "$WARN" -ge "$CRIT" ]; then
        echo "UNKNOWN: WARN value may not be greater than or equal the CRIT value"
        exit $UNKNOWN
    fi
}

# Run all precondition checks in order; each check exits the script itself
# when it fails.
# Arguments: the script's command line arguments, forwarded to check_arg
function init()
{
    check_root
    # "$@" keeps every argument intact; the former unquoted $* re-split
    # arguments containing blanks and so distorted the argument count.
    check_arg "$@"
    check_device
    check_warn_vs_crit
}

# Run "$HDDTEMP $DEVICE -n" and store its output in the global HEAT.
# Globals: HDDTEMP, DEVICE, UNKNOWN (read); HEAT (written)
# Exits with UNKNOWN when $HDDTEMP is not executable or when its output is
# not made up of digits only. The former pattern "[0-9]*" accepted anything
# that merely started with a digit (for example "12abc"), which then broke
# the numeric comparisons in check_heat.
function get_hddtemp()
{
    if [ ! -x "$HDDTEMP" ]; then
        echo "UNKNOWN: cannot execute $HDDTEMP"
        exit $UNKNOWN
    fi

    HEAT=$("$HDDTEMP" "$DEVICE" -n)
    case "$HEAT" in
        '' | *[!0-9]*)
            echo "UNKNOWN: Could not get temperature from: $DEVICE"
            exit $UNKNOWN
            ;;
    esac
}
# Compare HEAT with the thresholds, print the Nagios status line and exit
# with the matching code:
#   HEAT <  WARN          -> OK
#   WARN <= HEAT <  CRIT  -> WARNING
#   HEAT >= CRIT          -> CRITICAL
#   otherwise             -> UNKNOWN
# Globals: HEAT, WARN, CRIT, DEVICE, OK, WARNING, CRITICAL, UNKNOWN (read)
# Operands are quoted so that empty or multi-word values fail the numeric
# test cleanly instead of being word-split or glob-expanded.
function check_heat()
{
    if [ "$HEAT" -lt "$WARN" ]; then
        echo "OK: Temperature is below warn treshold ($DEVICE is $HEAT)"
        exit $OK
    elif [ "$HEAT" -lt "$CRIT" ]; then
        echo "WARNING: Temperature is above warn treshold ($DEVICE is $HEAT)"
        exit $WARNING
    elif [ "$HEAT" -ge "$CRIT" ]; then
        echo "CRITICAL: Temperature is above crit treshold ($DEVICE is $HEAT)"
        exit $CRITICAL
    else
        # Only reached when a comparison failed because a value is not numeric.
        echo "UNKNOWN: This error message should never occur, if it does happen anyway, get a new cup of coffee and fix the code :)"
        exit $UNKNOWN
    fi
}

# -- Main -- #

# Path of the hddtemp binary and the three positional arguments:
# device to query, warning threshold, critical threshold.
HDDTEMP=/usr/sbin/hddtemp
DEVICE=$1
WARN=$2
CRIT=$3


# Validate the environment and arguments, read the temperature, report it.
# "$@" forwards the arguments unchanged (the former unquoted $* re-split them).
init "$@"
get_hddtemp
check_heat
ELOG V3.1.4-7c3fd00