1. #!/usr/local/bin/bash
  2. # spinpid2.sh for dual fan zones.
  3. VERSION="2020-08-20"
  4. # Run as superuser. See notes at end.
  5. ##############################################
  6. #
  7. # Settings sourced from spinpid2.config
  8. # in same directory as the script
  9. #
  10. ##############################################
  11. DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
  12. source "$DIR/spinpid2.config"
  13. ##############################################
  14. # function get_disk_name
  15. # Get disk name from current LINE of DEVLIST
  16. ##############################################
  17. # The awk statements take $LINE as input and split it on whitespace.
  18. # If $LINE contains ": /dev/f", the first field is kept
  19. # without its trailing character.
  20. # Otherwise the second field is kept from its 6th character onward,
  21. # so that, for example, "/dev/sda" becomes "sda".
  22. # An earlier approach kept everything between '(' and ',' of camcontrol output;
  23. # that output seemed to change every version, so 2 options were needed to get the ada/da disk name.
  function get_disk_name {
    # Derive the bare device name from the current drive-list line.
    # Reads:  LINE  (global) - one line of the drive list
    # Writes: DEVID (global) - device name extracted from LINE
    case $LINE in
      *": /dev/f"*)
        # First whitespace-delimited field, minus its last character.
        DEVID=$(awk '{print substr($1, 1, length($1) - 1)}' <<< "$LINE")
        ;;
      *)
        # Second field from its 6th character on, e.g. "/dev/sda" -> "sda".
        DEVID=$(awk '{print substr($2, 6, length($0))}' <<< "$LINE")
        ;;
    esac
  }
  33. ############################################################
  34. # function print_header
  35. # Called when script starts and each quarter day
  36. ############################################################
  function print_header {
    # Print the two-line column header of the main log.
    # Reads:  DEVCOUNT, DEVLIST (global); calls get_disk_name per DEVLIST line
    # Writes: DATE, SPACES, LINE, DEVID (global)
    DATE=$(date +"%A, %b %d")
    SPACES=$(( DEVCOUNT * 5 + 42 )) # 5 spaces per drive
    printf "\n%-*s %3s %16s %29s \n" "$SPACES" "$DATE" "CPU" "New_Fan%" "New_RPM_____________________"
    printf " "
    # One 5-wide, left-justified column label per drive.
    while read -r LINE; do
      get_disk_name
      printf "%-5s" "$DEVID"
    done <<< "$DEVLIST"
    printf "%4s %5s %6s %6s %6s %3s %-7s %s %-4s %5s %5s %5s %5s %5s" \
      "Tmax" "Tmean" "ERRc" "P" "D" "TEMP" "MODE" "CPU" "PER" \
      "FANA" "FAN1" "FAN2" "FAN3" "FAN4"
  }
  48. #################################################
  49. # function read_fan_data
  50. #################################################
  function read_fan_data {
    # Refresh fan state from the BMC.
    # Reads:  IPMITOOL, HOW_DUTY, ZONE_CPU, ZONE_PER (global)
    # Writes: DUTY_CPU and DUTY_PER (only when HOW_DUTY is 1 and the reply is
    #         valid hex), MODE, MODEt, SDR, FAN1..FAN4, FANA (global)
    local reply fan

    # If set by user, read duty cycles and convert hex to decimal. Otherwise
    # the script assumes the duty cycles are what was last set.
    # A reply that is not plain hex (e.g. empty because ipmitool failed)
    # leaves the previous value untouched instead of raising an arithmetic error.
    if [[ ${HOW_DUTY:-0} == 1 ]]; then
      # read splits on whitespace, which drops the leading space of the reply
      read -r reply _ <<< "$($IPMITOOL raw 0x30 0x70 0x66 0 "$ZONE_CPU")"
      if [[ $reply =~ ^[0-9A-Fa-f]+$ ]]; then DUTY_CPU=$(( 16#$reply )); fi
      read -r reply _ <<< "$($IPMITOOL raw 0x30 0x70 0x66 0 "$ZONE_PER")"
      if [[ $reply =~ ^[0-9A-Fa-f]+$ ]]; then DUTY_PER=$(( 16#$reply )); fi
    fi

    # Read fan mode, convert to decimal, get text equivalent.
    read -r reply _ <<< "$($IPMITOOL raw 0x30 0x45 0)"
    if [[ $reply =~ ^[0-9A-Fa-f]+$ ]]; then
      MODE=$(( 16#$reply ))
    else
      MODE=""
    fi
    case $MODE in
      0) MODEt="Standard" ;;
      1) MODEt="Full" ;;
      2) MODEt="Optimal" ;;
      4) MODEt="HeavyIO" ;;
      # Never leave MODEt stale or empty: callers print it as one printf argument.
      *) MODEt="Unknown" ;;
    esac

    # Get reported fan speed in RPM from the sensor data repository:
    # the pertinent FAN line, then 3 to 5 consecutive digits.
    SDR=$($IPMITOOL sdr)
    for fan in FAN1 FAN2 FAN3 FAN4 FANA; do
      printf -v "$fan" '%s' "$(grep "$fan" <<< "$SDR" | grep -Eo '[0-9]{3,5}')"
    done
  }
  79. ##############################################
  80. # function CPU_check_adjust
  81. # Get CPU temp. Calculate a new DUTY_CPU.
  82. # Send to function adjust_fans.
  83. ##############################################
  function CPU_check_adjust {
    # Run one CPU cycle: read the CPU temperature, derive DUTY_CPU from it,
    # pass it to adjust_fans, optionally ease the peripheral duty down,
    # sleep CPU_T seconds and append a line to the CPU log.
    # Reads:  CPU_TEMP_SYSCTL, CORES, CPU_REF, CPU_SCALE, DUTY_CPU_MIN,
    #         DUTY_CPU_MAX, ZONE_CPU, ZONE_PER, PD, Tmean, SP, DUTY_PER_MIN,
    #         CPU_T, CPU_LOG_YES, CPU_LOG (global)
    # Writes: CPU_TEMP, DUTY_CPU, DUTY_CPU_LAST, and possibly DUTY_PER and
    #         DUTY_PER_LAST (global)
    if [[ $CPU_TEMP_SYSCTL == 1 ]]; then
      # Find hottest CPU core; cores without a numeric reading are skipped.
      MAX_CORE_TEMP=""
      for (( CORE = 0; CORE <= CORES; CORE++ )); do
        CORE_TEMP=$(sysctl -n "dev.cpu.${CORE}.temperature" | awk -F '.' '{print $1}')
        [[ $CORE_TEMP =~ ^-?[0-9]+$ ]] || continue
        if [[ -z $MAX_CORE_TEMP ]] || (( CORE_TEMP > MAX_CORE_TEMP )); then
          MAX_CORE_TEMP=$CORE_TEMP
        fi
      done
      CPU_TEMP=$MAX_CORE_TEMP
    else
      # lm-sensors: the 4th field of each "Package id" line looks like "+45.0°C".
      # Strip everything but sign, digits and dot rather than cutting a fixed
      # width, so the result does not depend on how awk counts the "°"
      # character; with several packages the hottest one is used.
      CPU_TEMP=$(sensors | awk '
        /Package id/ {
          t = $4
          gsub(/[^0-9.+-]/, "", t)
          t = int(t)
          if (!seen || t > max) { max = t; seen = 1 }
        }
        END { if (seen) print max }')
    fi

    DUTY_CPU_LAST=$DUTY_CPU
    if [[ $CPU_TEMP =~ ^-?[0-9]+$ ]]; then
      # This will break if settings have non-integers
      DUTY_CPU=$(( (CPU_TEMP - CPU_REF) * CPU_SCALE + DUTY_CPU_MIN ))
    else
      # No usable reading. An empty value would evaluate as 0 and push the
      # fans to minimum, so fail safe to the maximum duty instead.
      DUTY_CPU=$DUTY_CPU_MAX
    fi
    # Don't allow duty cycle outside min-max
    if (( DUTY_CPU > DUTY_CPU_MAX )); then DUTY_CPU=$DUTY_CPU_MAX; fi
    if (( DUTY_CPU < DUTY_CPU_MIN )); then DUTY_CPU=$DUTY_CPU_MIN; fi
    adjust_fans $ZONE_CPU $DUTY_CPU $DUTY_CPU_LAST

    # Use this short CPU cycle to also allow PER fans to come down
    # if PD < 0 and drives are at least 1 C below setpoint
    # (e.g., after high demand or if 100% at startup).
    # With multiple CPU cycles and no new drive temps, this will
    # drive fans to DUTY_PER_MIN, but that's ok if drives are that cool.
    # However, this is experimental.
    # Tmean is empty when no drive is spinning; checking it first keeps bc
    # from being fed an incomplete expression.
    if [[ -n $Tmean && PD -lt 0 && $(bc <<< "scale=2; $Tmean < ($SP-1)") == 1 ]]; then
      DUTY_PER_LAST=$DUTY_PER
      DUTY_PER=$(( DUTY_PER + PD ))
      # Don't allow duty cycle below min
      if (( DUTY_PER < DUTY_PER_MIN )); then DUTY_PER=$DUTY_PER_MIN; fi
      adjust_fans $ZONE_PER $DUTY_PER $DUTY_PER_LAST
    fi

    sleep "$CPU_T"
    if [[ $CPU_LOG_YES == 1 ]]; then
      print_interim_CPU >> "$CPU_LOG"
    fi
    # This will call user-defined function if it exists (see config).
    if declare -F Post_CPU_check_adjust > /dev/null; then
      Post_CPU_check_adjust
    fi
  }
  128. ##############################################
  129. # function DRIVES_check_adjust
  130. # Print time on new log line.
  131. # Go through each drive, getting and printing
  132. # status and temp. Calculate max and mean
  133. # temp, then calculate PID and new duty.
  134. # Call adjust_fans.
  135. ##############################################
  function DRIVES_check_adjust {
    # Go through each drive in DEVLIST, print its status and temperature,
    # compute max and mean temperature of the spinning drives, derive the
    # PD correction and the new peripheral duty, then call adjust_fans.
    # Reads:  DEVLIST, SP, Kp, Kd, DRIVE_T, DUTY_PER, DUTY_PER_MIN,
    #         DUTY_PER_MAX, ZONE_PER, ERRc (global); SMARTCTL (optional
    #         override of the smartctl command, default /sbin/smartctl)
    # Writes: Tmax, Tsum, Tmean, i, ERRp, ERRc, P, D, PD, DUTY_PER,
    #         DUTY_PER_LAST, STATUS, TEMP, RETURN, BIT0, BIT1 (global)
    local smartctl_cmd=${SMARTCTL:-/sbin/smartctl}
    local smart_out

    Tmax=0; Tsum=0 # initialize drive temps for new loop through drives
    i=0            # initialize count of spinning drives with a temperature
    while read -r LINE; do
      get_disk_name
      # Keep the report in a variable; no fixed temp file is needed.
      smart_out=$($smartctl_cmd -a -n standby "/dev/$DEVID")
      RETURN=$? # have to preserve return value or it changes
      BIT0=$(( RETURN & 1 ))
      BIT1=$(( RETURN & 2 ))
      if (( BIT0 == 0 )); then
        if (( BIT1 == 0 )); then
          STATUS="*" # spinning
        else # drive found but no response, probably standby
          STATUS="_"
        fi
      else # smartctl returns 1 (00000001) for missing drive
        STATUS="?"
      fi

      TEMP=""
      # Update temperatures each drive; spinners only
      if [[ $STATUS == "*" ]]; then
        # Taking 10th space-delimited field for most SATA:
        TEMP=$(awk '/Temperature_Celsius/ {print $10; exit}' <<< "$smart_out")
        if [[ -z $TEMP ]]; then
          # Else assume SAS, their output is:
          # Current Drive Temperature: 45 C
          TEMP=$(awk '/Drive Temperature/ {print $4; exit}' <<< "$smart_out")
        fi
        if [[ $TEMP =~ ^[0-9]+$ ]]; then
          TEMP=$(( 10#$TEMP )) # force base 10 so a leading 0 is not octal
          (( Tsum += TEMP ))
          # Numeric comparison: a string comparison would rank "9" above "10".
          if (( TEMP > Tmax )); then Tmax=$TEMP; fi
          (( i += 1 ))
        else
          # A spinning drive without a numeric temperature is left out of
          # the sum and the count so it cannot skew the mean.
          TEMP=""
        fi
      fi
      printf "%s%-2d " "$STATUS" "$TEMP"
    done <<< "$DEVLIST"

    DUTY_PER_LAST=$DUTY_PER
    if (( i == 0 )); then
      # No drive contributed a temperature.
      Tmean=""; Tmax=""; P=""; D=""; ERRc=""
      PD=0 # do not leave a stale correction behind for CPU_check_adjust
      DUTY_PER=$DUTY_PER_MIN
    else
      # summarize, calculate PD and print Tmax and Tmean
      # Need ERRc value if all drives had been spun down last time
      if [[ -z $ERRc ]]; then ERRc=0; fi
      Tmean=$(bc <<< "scale=2; $Tsum / $i")
      ERRp=$ERRc # save previous error before calculating current
      ERRc=$(bc <<< "scale=2; ($Tmean - $SP) / 1")
      P=$(bc <<< "scale=3; ($Kp * $ERRc) / 1")
      D=$(bc <<< "scale=4; $Kd * ($ERRc - $ERRp) / $DRIVE_T")
      PD=$(bc <<< "$P + $D") # add corrections
      # round for printing
      Tmean=$(printf %0.2f "$Tmean")
      ERRc=$(printf %0.2f "$ERRc")
      P=$(printf %0.2f "$P")
      D=$(printf %0.2f "$D")
      PD=$(printf %0.f "$PD") # must be integer for duty
      DUTY_PER=$(( DUTY_PER_LAST + PD ))
      # Don't allow duty cycle outside min-max
      if (( DUTY_PER > DUTY_PER_MAX )); then DUTY_PER=$DUTY_PER_MAX; fi
      if (( DUTY_PER < DUTY_PER_MIN )); then DUTY_PER=$DUTY_PER_MIN; fi
    fi

    # pass to the function adjust_fans
    adjust_fans $ZONE_PER $DUTY_PER $DUTY_PER_LAST
    # print current Tmax, Tmean
    printf "^%-3s %5s" "${Tmax:---}" "${Tmean:----}"
    # This will call user-defined function if it exists (see config).
    if declare -F Post_DRIVES_check_adjust > /dev/null; then
      Post_DRIVES_check_adjust
    fi
  }
  209. ##############################################
  210. # function adjust_fans
  211. # Zone, new duty, and last duty are passed as parameters
  212. ##############################################
  function adjust_fans {
    # Send a duty cycle to one fan zone when it changed, or unconditionally
    # while FIRST_TIME is 1.
    # Arguments: $1 zone, $2 new duty, $3 last duty
    # Writes:    ZONE, DUTY, DUTY_LAST, FIRST_TIME (global)
    ZONE=$1
    DUTY=$2
    DUTY_LAST=$3
    local send=0
    if [[ $DUTY -ne $DUTY_LAST ]]; then
      send=1
    elif [[ FIRST_TIME -eq 1 ]]; then
      send=1
    fi
    if (( send )); then
      # Wrapping in "echo -n" prevents a newline being generated in the log.
      echo -n "$($IPMITOOL raw 0x30 0x70 0x66 1 "$ZONE" "$DUTY")"
    fi
    FIRST_TIME=0
  }
  225. ##############################################
  226. # function print_interim_CPU
  227. # Sent to a separate file by the call
  228. # in CPU_check_adjust{}
  229. ##############################################
  function print_interim_CPU {
    # Print one CPU-log line: time, RPM of the fan named by RPM_CPU,
    # CPU temperature and CPU duty cycle.
    # Reads:  IPMITOOL, RPM_CPU, CPU_TEMP, DUTY_CPU, DUTY (global)
    # Writes: RPM, TIME (global)
    # DUTY holds whatever adjust_fans was last called with, which is the
    # peripheral duty whenever CPU_check_adjust also adjusted the PER zone;
    # log DUTY_CPU and fall back to DUTY only if DUTY_CPU is not set.
    local cpu_duty=${DUTY_CPU:-$DUTY}
    RPM=$($IPMITOOL sdr | grep "$RPM_CPU" | grep -Eo '[0-9]{2,5}')
    # print time on each line
    TIME=$(date "+%H:%M:%S")
    printf "%s %7s %5d %5d \n" "$TIME" "${RPM:----}" "$CPU_TEMP" "$cpu_duty"
  }
  236. ##############################################
  237. # function mismatch_test
  238. # Tests for mismatch
  239. # between fan duty and fan RPMs
  240. ##############################################
  function mismatch_test {
    # Flag a zone when its duty and measured RPM disagree: duty of 95 or more
    # with RPM below the zone's *_MAX threshold, or duty below 25 with RPM
    # above the zone's *_30 threshold.
    # Reads:  DUTY_CPU, DUTY_PER, RPM_CPU, RPM_PER, RPM_CPU_MAX, RPM_CPU_30,
    #         RPM_PER_MAX, RPM_PER_30 (global)
    # Writes: MISMATCH, MISMATCH_CPU, MISMATCH_PER (global)
    # RPM_CPU / RPM_PER hold the NAME of the fan variable; ${!name} fetches
    # that variable's current value.
    local cpu_rpm=${!RPM_CPU}
    local per_rpm=${!RPM_PER}
    MISMATCH=0
    MISMATCH_CPU=0
    MISMATCH_PER=0
    if (( (DUTY_CPU >= 95 && cpu_rpm < RPM_CPU_MAX) || (DUTY_CPU < 25 && cpu_rpm > RPM_CPU_30) )); then
      MISMATCH=1
      MISMATCH_CPU=1
      printf "\n%s\n" "Mismatch between CPU Duty and RPMs -- DUTY_CPU=$DUTY_CPU; RPM_CPU=${cpu_rpm}"
    fi
    if (( (DUTY_PER >= 95 && per_rpm < RPM_PER_MAX) || (DUTY_PER < 25 && per_rpm > RPM_PER_30) )); then
      MISMATCH=1
      MISMATCH_PER=1
      printf "\n%s\n" "Mismatch between PER Duty and RPMs -- DUTY_PER=$DUTY_PER; RPM_PER=${per_rpm}"
    fi
  }
  253. ##############################################
  254. # function force_set_fans
  255. # Used each cycle if a mismatch is detected and
  256. # after BMC reset
  257. ##############################################
  function force_set_fans {
    # Re-send the current duty to every zone flagged by mismatch_test,
    # CPU zone first, then peripheral zone, waiting 5 seconds after each.
    # Reads:  MISMATCH_CPU, MISMATCH_PER, ZONE_*, DUTY_*, DUTY_*_LAST (global)
    # Writes: FIRST_TIME (global), plus whatever adjust_fans writes
    local -a zones=(CPU PER)
    local -a notes=(
      "Attempting to fix CPU mismatch "
      "Attempting to fix PER mismatch "
    )
    local k flag zone_ref duty_ref last_ref
    for k in 0 1; do
      flag="MISMATCH_${zones[k]}"
      [[ ${!flag} == 1 ]] || continue
      zone_ref="ZONE_${zones[k]}"
      duty_ref="DUTY_${zones[k]}"
      last_ref="DUTY_${zones[k]}_LAST"
      FIRST_TIME=1 # forces adjust_fans to do it
      adjust_fans ${!zone_ref} ${!duty_ref} ${!last_ref}
      echo "${notes[k]}"
      sleep 5
    done
  }
  272. ##############################################
  273. # function reset_bmc
  274. # Triggered after 2 attempts to fix mismatch
  275. # between fan duty and fan RPMs
  276. ##############################################
  function reset_bmc {
    # Log a timestamped notice, cold-reset the BMC, wait 120 seconds,
    # then re-read the fan data.
    # Reads:  IPMITOOL (global)
    # Writes: TIME (global), plus whatever read_fan_data writes
    TIME=$(date "+%H:%M:%S")
    printf '%s ' "$TIME"
    printf '%s' "Resetting BMC after second attempt failed to fix mismatch -- "
    $IPMITOOL bmc reset cold
    sleep 120
    read_fan_data
  }
  284. #####################################################
  285. # SETUP
  286. # All this happens only at the beginning
  287. # Initializing values, list of drives, print header
  288. #####################################################
  # Print the configured settings at the beginning of the log.
  printf "\n****** SETTINGS ******\n"
  printf "CPU zone %s; Peripheral zone %s\n" "$ZONE_CPU" "$ZONE_PER"
  printf "CPU fans min/max duty cycle: %s/%s\n" "$DUTY_CPU_MIN" "$DUTY_CPU_MAX"
  printf "PER fans min/max duty cycle: %s/%s\n" "$DUTY_PER_MIN" "$DUTY_PER_MAX"
  printf "CPU fans - measured RPMs at 30%% and 100%% duty cycle: %s/%s\n" "$RPM_CPU_30" "$RPM_CPU_MAX"
  printf "PER fans - measured RPMs at 30%% and 100%% duty cycle: %s/%s\n" "$RPM_PER_30" "$RPM_PER_MAX"
  printf "Drive temperature setpoint (C): %s\n" "$SP"
  printf "Kp=%s, Kd=%s\n" "$Kp" "$Kd"
  printf "Drive check interval (main cycle; minutes): %s\n" "$DRIVE_T"
  printf "CPU check interval (seconds): %s\n" "$CPU_T"
  printf "CPU reference temperature (C): %s\n" "$CPU_REF"
  printf "CPU scalar: %s\n" "$CPU_SCALE"
  if [[ $HOW_DUTY == 1 ]]; then
    printf "Reading fan duty from board \n"
  else
    printf "Assuming fan duty as set \n"
  fi

  # CPU_TEMP_SYSCTL becomes 1 when sysctl lists dev.cpu.0.temperature
  # (will likely fail in a VM), otherwise 0.
  CPU_TEMP_SYSCTL=$(( $(sysctl -a | grep -c dev.cpu.0.temperature) > 0 ))
  case $CPU_TEMP_SYSCTL in
    1)
      printf "Getting CPU temperatures via sysctl \n"
      # Highest core index to poll; -1 because numbering starts at 0.
      CORES=$(( $(sysctl -n hw.ncpu) - 1 ))
      ;;
    *)
      printf "Getting CPU temperature via lm-sensors \n"
      ;;
  esac

  # Number of whole CPU loops per drive loop.
  CPU_LOOPS=$(bc <<< "$DRIVE_T * 60 / $CPU_T")
  # Initialize errors to 0.
  I=0
  ERRc=0
  FIRST_TIME=1

  # Alter RPM thresholds to allow some slop: the 30% figures are raised
  # by 20%, the 100% figures lowered by 20%, both truncated to integers.
  RPM_CPU_30=$(bc <<< "scale=0; 1.2 * $RPM_CPU_30 / 1")
  RPM_CPU_MAX=$(bc <<< "scale=0; 0.8 * $RPM_CPU_MAX / 1")
  RPM_PER_30=$(bc <<< "scale=0; 1.2 * $RPM_PER_30 / 1")
  RPM_PER_MAX=$(bc <<< "scale=0; 0.8 * $RPM_PER_MAX / 1")
  # Get the list of drives as reported by "inxi -D".
  DEVLIST1=$(inxi -D)
  # Drop lines for non-spinning devices; edit the patterns as needed.
  # Another strategy is to keep only lines containing something unique to
  # the drives you want, for instance only WDC drives:
  # if [[ $LINE != *"WDC"* ]] . . .
  DEVLIST=$(sed '/KINGSTON/d;/ADATA/d;/SanDisk/d;/OCZ/d;/LSI/d;/EXP/d;/INTEL/d;/TDKMedia/d;/SSD/d;/VMware/d;/Enclosure/d;/Card/d;/Flash/d;/Virtual/d;/total/d' <<< "$DEVLIST1")
  DEVCOUNT=$(wc -l <<< "$DEVLIST")

  # RPM_PER and RPM_CPU hold the NAME of the fan variable (FAN1 or FANA)
  # that represents each zone; the value is fetched later with ${!RPM_*}.
  if (( ZONE_PER == 0 )); then
    RPM_PER=FAN1
    RPM_CPU=FANA
  else
    RPM_PER=FANA
    RPM_CPU=FAN1
  fi
  read_fan_data

  # If mode is not Full (1), set it to avoid the BMC changing duty cycle.
  # Need to wait a tick or it may not get the next command.
  # "echo -n" avoids the newline otherwise generated in the log.
  if (( MODE != 1 )); then
    echo -n "$($IPMITOOL raw 0x30 0x45 1 1)"
    sleep 1
  fi

  # Start fan duty at a reasonable value (50) if fans are going fast or
  # DUTY_* was not read in read_fan_data (${DUTY_*+x} is empty when unset).
  if [[ ${!RPM_PER} -ge RPM_PER_MAX || -z ${DUTY_PER+x} ]]; then
    echo -n "$($IPMITOOL raw 0x30 0x70 0x66 1 $ZONE_PER 50)"
    DUTY_PER=50
    sleep 1
  fi
  if [[ ${!RPM_CPU} -ge RPM_CPU_MAX || -z ${DUTY_CPU+x} ]]; then
    echo -n "$($IPMITOOL raw 0x30 0x70 0x66 1 $ZONE_CPU 50)"
    DUTY_CPU=50
    sleep 1
  fi

  # Before starting, go through the drives and warn when the smartctl
  # return value indicates a problem (>2).
  # -a is used so that all return values are available.
  while read -r LINE; do
    get_disk_name
    /sbin/smartctl -a -n standby "/dev/$DEVID" > /var/tempfile
    if (( $? > 2 )); then
      printf "\n"
      printf "*******************************************************\n"
      printf "* WARNING - Drive %-4s has a record of past errors, *\n" "$DEVID"
      printf "* is currently failing, or is not communicating well. *\n"
      printf "* Use smartctl to examine the condition of this drive *\n"
      printf "* and conduct tests. Status symbol for the drive may *\n"
      printf "* be incorrect (but probably not). *\n"
      printf "*******************************************************\n"
    fi
  done <<< "$DEVLIST"

  printf "\n%s %36s %s \n" "Key to drive status symbols: * spinning; _ standby; ? unknown" "Version" $VERSION
  print_header

  # CPU temperature for the first round of printing, taken from the SDR
  # text that read_fan_data stored.
  CPU_TEMP=$(grep "CPU Temp" <<< "$SDR" | grep -Eo '[0-9]{2,5}')

  # Initialize CPU log (overwrites any existing file).
  if [[ $CPU_LOG_YES == 1 ]]; then
    printf "%s \n%s \n%17s %5s %5s \n" "$DATE" "Printed every CPU cycle" $RPM_CPU "Temp" "Duty" > "$CPU_LOG"
  fi
  386. ###########################################
  387. # Main loop through drives every DRIVE_T minutes
  388. # and CPU every CPU_T seconds
  389. ###########################################
  while true; do
    # Print header every quarter day. The space padding of %k is removed
    # and base 10 is forced so a leading 0 is not seen as octal.
    HM=$(date +%k%M)
    HM=$(( 10#${HM// /} ))
    R=$(( HM % 600 )) # HHMM modulo 600: small just after 0:00, 6:00, 12:00, 18:00
    if (( R < DRIVE_T )); then
      print_header
    fi

    #
    # Main stuff
    #
    echo # start new line
    TIME=$(date "+%H:%M:%S"); echo -n "$TIME " # print time on each line
    DRIVES_check_adjust # prints drive data also
    sleep 5 # Let fans equilibrate to duty before reading them
    read_fan_data
    # Every argument is quoted or defaulted so an empty value (e.g. MODEt)
    # cannot shift the remaining columns.
    printf "%7s %6s %6.6s %4s %-7s %3d %3d %6s %5s %5s %5s %5s" \
      "${ERRc:----}" "${P:----}" "${D:----}" "$CPU_TEMP" "${MODEt:----}" \
      "$DUTY_CPU" "$DUTY_PER" \
      "${FANA:----}" "${FAN1:----}" "${FAN2:----}" "${FAN3:----}" "${FAN4:----}"

    # Test loop for BMC reset. Exit loop if no mismatch found between duty
    # and rpm, or after 2 attempts to fix lead to bmc reset and a third
    # attempt to fix. This should happen after reading fans so CPU loops
    # don't result in false mismatch.
    ATTEMPTS=0 # Number of attempts to fix duties
    mismatch_test
    while (( MISMATCH == 1 )); do
      force_set_fans
      (( ATTEMPTS += 1 ))
      read_fan_data
      mismatch_test
      # Compare the variables' values numerically; comparing the bare words
      # ATTEMPTS / MISMATCH with "2" / "1" is never true, which would make
      # the BMC reset unreachable.
      if (( ATTEMPTS == 2 )); then
        if (( MISMATCH == 1 )); then
          reset_bmc
          force_set_fans
          read_fan_data
          mismatch_test
        else
          break
        fi
      fi
      if (( ATTEMPTS == 3 )); then
        break
      fi
    done

    # CPU loop: CPU_LOOPS short cycles per drive cycle.
    for (( i = 0; i < CPU_LOOPS; i++ )); do
      CPU_check_adjust
    done
  done
  443. # For SuperMicro motherboards with dual fan zones.
  444. # Adjusts fans based on drive and CPU temperatures.
  445. # Includes disks on motherboard and on HBA.
  446. # Mean drive temp is maintained at a setpoint using a PID algorithm.
  447. # CPU temp need not and cannot be maintained at a setpoint,
  448. # so PID is not used; instead fan duty cycle is simply
  449. # increased with temp using reference and scale settings.
  450. # Drives are checked and fans adjusted on a set interval, such as 5 minutes.
  451. # Logging is done at that point. CPU temps can spike much faster,
  452. # so are checked and logged at a shorter interval, such as 1-15 seconds.
  453. # CPUs with high TDP probably require short intervals.
  454. # Logs:
  455. # - Disk status (* spinning or _ standby)
  456. # - Disk temperature (Celsius) if spinning
  457. # - Max and mean disk temperature
  458. # - Temperature error and PID variables
  459. # - CPU temperature
  460. # - RPM for FANA and FAN1-4 before new duty cycles
  461. # - Fan mode
  462. # - New fan duty cycle in each zone
  463. # - In CPU log:
  464. # - RPM of the first fan in CPU zone (FANA or FAN1)
  465. # - CPU temperature
  466. # - new CPU duty cycle
  467. # Relation between percent duty cycle, hex value of that number,
  468. # and RPMs for my fans. RPM will vary among fans, is not
  469. # precisely related to duty cycle, and does not matter to the script.
  470. # It is merely reported.
  471. #
  472. # Percent Hex RPM
  473. # 10 A 300
  474. # 20 14 400
  475. # 30 1E 500
  476. # 40 28 600/700
  477. # 50 32 800
  478. # 60 3C 900
  479. # 70 46 1000/1100
  480. # 80 50 1100/1200
  481. # 90 5A 1200/1300
  482. # 100 64 1300
  483. ################
  484. # Tuning Advice
  485. ################
  486. # PID tuning advice on the internet generally does not work well in this application.
  487. # First run the script spincheck.sh and get familiar with your temperature and fan variations without any intervention.
  488. # Choose a setpoint that is an actual observed Tmean, given the number of drives you have. It should be the Tmean associated with the Tmax that you want.
  489. # Start with Kp low. Find the lowest ERRc (which is Tmean - setpoint) in the output other than 0 (don't worry about sign +/-). Set Kp to 0.5 / ERRc, rounded up to an integer. My lowest ERRc is 0.14. 0.5 / 0.14 is 3.6, and I find Kp = 4 is adequate. Higher Kp will give a more aggressive response to error, but the downside may be overshooting the setpoint and oscillation. Kd offsets that, but raising them both makes things unstable and harder to tune.
  490. # Set Kd at about Kp*10
  491. # Get Tmean within ~0.3 degree of SP before starting script.
  492. # Start script and run for a few hours or so. If Tmean oscillates (best to graph it), you probably need to reduce Kd. If no oscillation but response is too slow, raise Kd.
  493. # Stop script and get Tmean at least 1 C off SP. Restart. If there is overshoot and it goes through some cycles, you may need to reduce Kd.
  494. # If you have problems, examine P and D in the log and see which is messing you up.
  495. # Uses joeschmuck's smartctl method for drive status (returns 0 if spinning, 2 in standby)
  496. # https://forums.freenas.org/index.php?threads/how-to-find-out-if-a-drive-is-spinning-down-properly.2068/#post-28451
  497. # Other method (camcontrol cmd -a) doesn't work with HBA

# spinpid2.sh