#!/bin/sh
#
# NOTE: keep the following two files in the repository in sync:
#    opennet/workarounds/on_workaround_wifi_master_hangs
#    opennet/packages/on-core/files/etc/cron.5mins/on_workaround_wifi_master_hangs
#
# In der ath9k-Version der Firmware v0.5.1 bleibt der Master einer Verbindung manchmal hängen.
# Symptome: alle Clients trennen sich nach einer Weile. Zuvor bleibt der RX- oder TX-Zähler auf der Master-Seite stehen.
# Lösung: "wifi" für neue Initialisierung des Masters ausführen
#
# Dieses Skript darf häufig ausgeführt werden (z.B. im Minuten- oder Viertelstundentakt).
#
# Das Skript prüft folgende Voraussetzungen:
#    - das Interface ist als master konfiguriert
#    - das Interface wird für OLSR-Routing verwendet (um regelmäßigen Verkehr sicherzustellen)
#    - das Interface war irgendwann (seit dem letzten Booten) mit Clients verbunden
#
# Ein Reset wird ausgelöst, falls:
#    - seit dem letzten Aufruf kein Verkehr über das Interface floss
#    - seit dem letzten Reset genügend Zeit vergangen ist
#      (abhängig vom Zeitpunkt der letzten Verbindung eines Clients)
#

set -eu

# Wireless interface that is supervised by this workaround.
WIFI_DEVICE="wlan0"

# State files below /tmp; all of them are named after this script and the interface.
# File holding the uptime (in seconds) at which the last reset was triggered.
RESET_TIMESTAMP_FILE="/tmp/$(basename "$0")-$WIFI_DEVICE.timestamp_last_reset"
# File holding the uptime (in seconds) at which associated stations were seen most recently.
SEEN_CLIENTS_TIMESTAMP_FILE="/tmp/$(basename "$0")-$WIFI_DEVICE.timestamp_seen_clients"
# printf pattern for the per-direction packet counter caches ("%s" is replaced by "RX" or "TX").
LAST_PEER_TRAFFIC_COUNTER_FILENAME_PATTERN="/tmp/$(basename "$0")-$WIFI_DEVICE-%s.cache"

MINIMUM_RESET_PERIOD_SECONDS=900


# Succeed if "iwinfo" can report information about $WIFI_DEVICE.
# All output (stdout and stderr) of the probe is discarded; only the exit status matters.
is_wifi_device() {
	iwinfo "$WIFI_DEVICE" info 2>/dev/null >/dev/null
}


# Succeed if the "iwinfo ... info" output for $WIFI_DEVICE contains "Mode: Master"
# (matched as whole words).
is_wifi_master() {
	iwinfo "$WIFI_DEVICE" info \
		| grep -q -w "Mode: Master"
}

# Succeed if a kernel module with "ath10k" in its "lsmod" line is loaded.
# The workaround must not be started on ath10k devices.
is_ath10k_device() {
	lsmod | grep -q "ath10k"
}

# Succeed if $WIFI_DEVICE occurs (as a whole word) in the generated olsrd configuration.
# A missing or unreadable configuration file silently counts as "not an OLSR interface".
is_olsr_interface() {
	grep -q -s -w "$WIFI_DEVICE" /var/etc/olsrd.conf
}


# Store the current system uptime (whole seconds, i.e. the part of /proc/uptime before the
# first ".") in the given file, replacing any previous content.
# Arguments: $1 - path of the timestamp file
update_file_timestamp() {
	local timestamp_file="$1"
	cut -d "." -f 1 /proc/uptime >"$timestamp_file"
}


# Refresh $SEEN_CLIENTS_TIMESTAMP_FILE if the association list of $WIFI_DEVICE contains at
# least one line mentioning "dBm" (presumably one such line per associated station).
# Returns success without touching the file if no such line is found.
update_seen_clients_timestamp_if_stations_are_connected() {
	if ! iwinfo "$WIFI_DEVICE" assoc | grep -q dBm; then
		# nothing is associated right now: keep the previous timestamp (if any)
		return 0
	fi
	update_file_timestamp "$SEEN_CLIENTS_TIMESTAMP_FILE"
}


# Print the age (in whole minutes) of the uptime-based timestamp stored in a file.
#
# The file is expected to contain the system uptime in seconds at the time of the event
# (as written by update_file_timestamp).  The age is the difference between the current
# uptime and the stored value.
#
# Arguments: $1 - path of the timestamp file
# Outputs:   the age in minutes; nothing at all if the file is missing or empty
# Returns:   0 (also if there is no timestamp)
get_file_timestamp_age_minutes() {
	local filename="$1"
	local stored_timestamp
	local now
	# a missing or empty file means "there is no timestamp": print nothing
	[ -s "$filename" ] || return 0
	stored_timestamp=$(cat "$filename")
	now=$(cut -f 1 -d . /proc/uptime)
	# age = current uptime minus stored uptime (the reverse order would yield negative ages)
	echo "$(( (now - stored_timestamp) / 60 ))"
}


# Print the per-peer packet counters of $WIFI_DEVICE for one traffic direction.
#
# Every line of the association list containing the word "<direction>:" is reduced to the
# number in front of " Pkts"; the resulting numbers are printed sorted numerically (one per
# line).  "0" is printed if there is no such line.
#
# Arguments: $1 - direction, should be "RX" or "TX"
get_peer_traffic_counts() {
	local direction="$1"
	local packet_counts
	packet_counts=$(
		iwinfo "$WIFI_DEVICE" assoclist \
			| grep -w "$direction:" \
			| sed 's/^.*\s\([0-9]\+\) Pkts.*$/\1/' \
			| sort -n
	)
	if [ -z "$packet_counts" ]; then
		echo "0"
	else
		echo "$packet_counts"
	fi
}


# Succeed if the per-peer packet counters of one direction did not change since the
# previous invocation.
#
# The current counters are always written to the direction-specific cache file (derived
# from $LAST_PEER_TRAFFIC_COUNTER_FILENAME_PATTERN) before the result is determined.
# The very first invocation (no cache file yet) is never reported as stale.
#
# Arguments: $1 - direction, should be "RX" or "TX"
# Returns:   0 if the counters are unchanged (stale), 1 otherwise
is_peer_traffic_count_for_direction_stale() {
	local direction="$1"
	local cache_file previous_counts fresh_counts had_cache
	# shellcheck disable=SC2059
	cache_file=$(printf "$LAST_PEER_TRAFFIC_COUNTER_FILENAME_PATTERN" "$direction")
	previous_counts=
	had_cache="no"
	if [ -e "$cache_file" ]; then
		previous_counts=$(cat "$cache_file")
		had_cache="yes"
	fi
	fresh_counts=$(get_peer_traffic_counts "$direction")
	# remember the current counters for the next invocation
	echo "$fresh_counts" >"$cache_file"
	# without a previous measurement there is nothing to compare against
	[ "$had_cache" = "yes" ] || return 1
	# identical counters: no packet was counted in between
	[ "$previous_counts" = "$fresh_counts" ]
}


# Decide whether a reset of the wireless interface is permitted right now.
#
# The minimum period between two resets grows with the time that passed since stations
# were last seen (15 minutes up to one day).  No reset is allowed if stations were never
# seen or if the last sighting is at least 168 hours (one week) old.
#
# Globals: SEEN_CLIENTS_TIMESTAMP_FILE (read), RESET_TIMESTAMP_FILE (read)
# Returns: 0 if a reset is allowed, 1 otherwise
is_reset_allowed() {
	local last_seen_clients_timestamp_age_hours reset_timestamp_age_minutes
	local minimum_reset_period_minutes
	last_seen_clients_timestamp_age_hours=$(get_file_timestamp_age_minutes "$SEEN_CLIENTS_TIMESTAMP_FILE" | awk '{print int($1/60)}')
	# there was never a successful connection: reset is not allowed
	# ("return" instead of "exit": a predicate must not terminate the calling script)
	[ -z "$last_seen_clients_timestamp_age_hours" ] && return 1
	# the minimum reset period depends on the age of the "last client seen" timestamp
	if [ "$last_seen_clients_timestamp_age_hours" -lt 1 ]; then
		minimum_reset_period_minutes=15
	elif [ "$last_seen_clients_timestamp_age_hours" -lt 3 ]; then
		minimum_reset_period_minutes=30
	elif [ "$last_seen_clients_timestamp_age_hours" -lt 6 ]; then
		minimum_reset_period_minutes=60
	elif [ "$last_seen_clients_timestamp_age_hours" -lt 24 ]; then
		minimum_reset_period_minutes=180
	elif [ "$last_seen_clients_timestamp_age_hours" -lt 168 ]; then
		minimum_reset_period_minutes=1440
	else
		# we did not see successful connections for quite some time - no more resets ...
		return 1
	fi
	# the age of the previous reset is read from the reset timestamp file
	reset_timestamp_age_minutes=$(get_file_timestamp_age_minutes "$RESET_TIMESTAMP_FILE")
	# there was no reset before: reset is allowed
	[ -z "$reset_timestamp_age_minutes" ] && return 0
	# a reset is allowed, if the reset timeout is expired
	[ "$reset_timestamp_age_minutes" -ge "$minimum_reset_period_minutes" ]
}


# Re-initialise the wireless setup and leave traces of this action:
# a banner event is added and the reset timestamp file is refreshed before the reset.
reset_wireless_interfaces() {
	# reset the wireless interface
	on-function add_banner_event "wifi master hangs"
	# remember the time of this reset (evaluated for the minimum period between resets)
	update_file_timestamp "$RESET_TIMESTAMP_FILE"
	# there seem to be situations in which hostapd blocks itself
	# see https://dev.opennet-initiative.de/ticket/184
	# ("|| true": a failing killall must not abort the script, which runs with "set -e")
	killall -q -TERM hostapd || true
	# run "wifi" in order to initialise the master again
	wifi
}


# check conditions for this workaround
# (every unmet condition ends the script successfully: the workaround simply does not apply)
is_wifi_device || exit 0
is_wifi_master || exit 0
# the workaround is not applied to ath10k devices
is_ath10k_device && exit 0
# We do not want to reset interfaces due to the lack of traffic (e.g. local wifi interfaces).
# Thus we insist on OLSR being used on this interface.  This guarantees permanent traffic flow.
is_olsr_interface || exit 0

# refresh the "seen clients" timestamp if stations are associated right now
update_seen_clients_timestamp_if_stations_are_connected

# no action is required, if we never saw any connection
[ -e "$SEEN_CLIENTS_TIMESTAMP_FILE" ] || exit 0

# one symptom of the problem is a stuck RX or TX queue (visible in the peer-based packet counters)
# Up to 2020 we encountered only the symptom of the RX queue being stuck.
# In 2020 we also noticed a stuck TX queue on AP1.52.
# NOTE: due to the short-circuiting "||" the TX counters are only compared (and their cache is
# only refreshed) if the RX counters are not stale.
if is_peer_traffic_count_for_direction_stale "RX" || is_peer_traffic_count_for_direction_stale "TX"; then
	# the reset itself is rate-limited
	if is_reset_allowed; then
		reset_wireless_interfaces
	fi
fi
