parsing

Parsing authentication log with python

This simple script is just an exercise. I’m learning Python and, frankly, I just like to parse text files.

The code below parses the /var/log/auth.log file and searches for failed authentication attempts. For each failed attempt it records the IP address, the date, the account used to authenticate and the remote port used to authenticate. It then resolves the IP to a hostname and generates a list of the distinct accounts and ports used by each particular IP address. This list is displayed at the end of script execution. Instead of showing all ports used, it shows just the range from the lowest to the highest. By default only the first five accounts are displayed in the table (unless the list of those five is longer than 30 chars – in such a case the list is truncated). If you want to display all recorded accounts, you can replace the code in line 176 from this one:

parsed_accounts     = adjust_item( five_accounts,         30 )

to this one:

parsed_accounts     = item["accounts"]

The columns NOFA and NOFP show the number of accounts and the number of ports used, respectively. The date shown can be read as ‘last seen’ for a particular IP address.

The example output:

auth

The script:

#!/usr/bin/env python3.4

# IMPORTS
import re
import socket
import pprint
from colorama import init, Fore, Back, Style

# VARS
# Log file that the script analyses.
log_path = "/var/log/auth.log"

# hosts:           one {"ip", "hostname"} dict per address collected from the log.
# full_hosts_data: one dict per host that also has failed password attempts.
hosts, full_hosts_data = [], []

# Last address handed to get_hostname() and the name it resolved to.
previous_ip = previous_host = ""

# ADJUSTING TO FIXED LENGTH
def adjust_item( str, i ):
    """Pad ``str`` on the right with spaces until it is ``i`` characters long.

    A string that is already ``i`` characters or longer is returned
    unchanged; this function never truncates.

    Args:
        str: text to pad. The name shadows the builtin inside this function
            only; it is kept so that existing callers are unaffected.
        i: minimum length of the returned string.

    Returns:
        The padded string.
    """
    # ljust pads in a single step instead of appending one space per loop pass.
    return str.ljust(i)

# AS THE NAME SAYS
def get_hostname( ip ):
    """Return the reverse-DNS name for ``ip``, or ``ip`` itself if lookup fails.

    Only the most recent lookup is remembered (in the module-level
    ``previous_ip`` / ``previous_host``), so asking for the same address
    twice in a row triggers a single ``socket.gethostbyaddr`` call.

    Args:
        ip: address to resolve.

    Returns:
        The primary host name reported by ``socket.gethostbyaddr``, or the
        unchanged ``ip`` when the lookup raises.
    """
    global previous_ip
    global previous_host

    if previous_ip != ip:
        try:
            resolved = socket.gethostbyaddr(ip)[0]
        except Exception:
            # Best effort: any lookup failure falls back to the raw address.
            resolved = ip
        previous_ip = ip
        previous_host = resolved

    return previous_host

# RETURNING FIRST FIVE ACCOUNTS AND NUMBER OF ALL ACCOUNTS TRIED
def first_5( parsed_string ):
    """Summarise a "|"-separated account list.

    Empty entries (such as the one produced by a leading "|") are ignored.

    Args:
        parsed_string: accounts separated by "|".

    Returns:
        A ``(text, total)`` tuple: ``text`` holds at most the first five
        accounts, each prefixed with "|"; ``total`` is the number of all
        non-empty accounts in ``parsed_string``.
    """
    names = [entry for entry in parsed_string.split("|") if len(entry) > 0]
    shown = "".join("|" + name for name in names[:5])
    return (shown, len(names))

# CHECKING PORT RANGE AND NUMBER OF PORTS WITH FAILED PASSWORDS
def port_parser( parsed_string):
    """Report the port range and port count of a "|"-separated port list.

    Empty entries are ignored. The search starts from the sentinels -1
    (largest) and 66000 (smallest), which are returned as-is when no port
    moves them.

    Args:
        parsed_string: port numbers separated by "|".

    Returns:
        A ``(largest, smallest, counter)`` tuple of ints.

    Raises:
        ValueError: if a non-empty entry is not an integer.
    """
    numbers = [int(entry) for entry in parsed_string.split("|") if len(entry) > 0]
    largest = max([-1] + numbers)
    smallest = min([66000] + numbers)
    return (largest, smallest, len(numbers))

def get_date( my_line ):
    """Return the leading timestamp of a log line.

    The result is everything up to the second ":" plus the text after it up
    to the next space, e.g. "Jun  2 08:47:37".

    Raises:
        IndexError: if the line contains fewer than two ":" characters.
    """
    pieces = my_line.split(":")
    seconds = pieces[2].split(" ")[0]
    return ":".join((pieces[0], pieces[1], seconds))

def get_ports( my_line ):
    """Return the word that follows the first " port " in a log line.

    Raises:
        IndexError: if the line does not contain " port ".
    """
    after_keyword = my_line.split(" port ")[1]
    return after_keyword.split(" ")[0]

def get_username( my_line ):
    """Return the word that follows the first "invalid user " in a log line.

    Raises:
        IndexError: if the line does not contain "invalid user ".
    """
    after_keyword = my_line.split("invalid user ")[1]
    return after_keyword.split(" ")[0]

def get_username2( my_line ):
    """Return the word that follows the first "Failed password for " in a line.

    Args:
        my_line: log line to parse.

    Returns:
        The account name, i.e. the text after "Failed password for " up to
        the next space.

    Raises:
        IndexError: if the line does not contain "Failed password for ".
    """
    # Parse the argument itself rather than a module-level variable, so the
    # result does not depend on what the caller's loop variable is called.
    username_words = my_line.split("Failed password for ")
    return username_words[1].split(" ")[0]

def check_distinct(itemlist, my_item):
    """Append ``my_item`` to a "|"-separated list unless it is already there.

    Args:
        itemlist: existing entries separated by "|".
        my_item: entry to add.

    Returns:
        ``itemlist`` unchanged when one of its entries equals ``my_item``,
        otherwise ``itemlist`` followed by "|" and ``my_item``.
    """
    if my_item in itemlist.split("|"):
        return itemlist
    return itemlist + "|" + my_item

# READ FILE
# errors='replace' substitutes a placeholder for any byte sequence that is not
# valid in the default encoding, so one odd byte in the log cannot abort the
# whole run with UnicodeDecodeError.
with open(log_path, 'rt', errors='replace') as log:
    text = log.read()

# COLLECTING HOSTS AND IPS
for line in text.split("\n"):
    # Ignore blank and very short lines.
    if len(line) <= 5:
        continue

    # Line types that are never used for host collection.
    if "cron:session" in line or "Disconnecting" in line or "Address" in line:
        continue

    # Only break-in warnings contribute hosts.
    if "POSSIBLE BREAK-IN ATTEMPT" not in line:
        continue

    # The address is the text between the first " [" and the next "]".
    # A warning without that field is skipped instead of raising IndexError.
    fields = line.split(" [")
    if len(fields) < 2:
        continue
    ip = fields[1].split("]")[0]

    # Check for duplicates first, so the (slow) reverse lookup is made only
    # for addresses that are actually going to be stored.
    if any(known["ip"] == ip for known in hosts):
        continue

    hosts.append({"ip": ip, "hostname": get_hostname(ip)})

# The line list is identical for every host, so split the log only once.
log_lines = text.split("\n")

for my_host in hosts:
    ports = ""
    accounts = ""
    date = ""

    # Match the address as a whole token: a plain substring test would also
    # count lines for e.g. 11.2.3.45 when looking for 1.2.3.4. The address
    # must not be preceded by a digit or dot, nor followed by a digit or by
    # a dot plus digit (a sentence-ending "." is still allowed).
    ip_pattern = re.compile(
        r"(?<![\d.])" + re.escape(my_host["ip"]) + r"(?!\d)(?!\.\d)"
    )

    # NOTE: the loop variable is deliberately still called `line`; renaming
    # it is only safe once no helper in this file reads the global `line`.
    for line in log_lines:
        # CHECK LINES FOR FAILED PASS ATTEMPTS
        if "Failed password" not in line or not ip_pattern.search(line):
            continue

        # GET USERNAME (the wording differs for accounts that do not exist)
        if "Failed password for invalid " in line:
            username = get_username(line)
        else:
            username = get_username2(line)

        port = get_ports(line)                          # GET PORT USED
        date = get_date(line)                           # GET DATE (last match wins)
        ports = check_distinct(ports, port)             # SAVE ONLY DISTINCT PORTS
        accounts = check_distinct(accounts, username)   # SAVE ONLY DISTINCT ACCOUNTS

    # SAVE ACTUAL ATTEMPTS (hosts without any failed password are dropped)
    if len(ports) > 1:
        full_hosts_data.append({
            "ip": my_host["ip"],
            "hostname": my_host["hostname"],
            "accounts": accounts,
            "ports": ports,
            "date": date,
        })

# PRINT TABLE HEADERS
# One padded cell per column; print() joins the cells with single spaces.
header_cells = (
    adjust_item("DATE", 16),
    adjust_item("IP", 15),
    adjust_item("HOSTNAME", 40),
    # ACCOUNTS and NOFA are concatenated into a single cell (no separator).
    adjust_item("ACCOUNTS", 30) + adjust_item("NOFA ", 4),
    adjust_item("PORT RANGE", 12),
    adjust_item("NOFP", 5),
)
print(*header_cells)

# GENERATING OUTPUT
# DATE             IP              HOSTNAME                                 ACCOUNTS                      NOFA  PORT RANGE   NOFP
# Jun  2 08:47:37  61.174.51.XXX   XXX.51.174.61.dial.XXX.dynamic.163data   root|admin                    2     2804 ->58246 30

for item in full_hosts_data:

    # port_parser returns (largest, smallest, count) in that order.
    port_high, port_low, port_total = port_parser(item["ports"])
    five_accounts, account_total = first_5(item["accounts"])

    parsed_accounts = adjust_item(five_accounts, 30)
    port_range = adjust_item(str(port_low), 5) + "->" + adjust_item(str(port_high), 5)

    row = (
        adjust_item(item["date"], 16)[:16],
        adjust_item(item["ip"], 15),
        adjust_item(item["hostname"], 40)[:40],
        parsed_accounts[1:30],      # [1:] drops the leading "|" of the list
        adjust_item(str(account_total), 5),
        port_range,
        adjust_item(str(port_total), 5),
    )
    print(*row)

The code above is provided as is. I do not guarantee it will work in your environment specifically. I’ve tested this on Debian Jessie (testing). Please use it at your own risk.
Obrazek

[Another] Apache access log parser in Python

I must admit – Python is mighty cool. I started learning it yesterday and today I managed to create a simple parser for the Apache2 access log – and with colors!

Nothing fancy, so I’m just going to drop it here:

 

#!/usr/bin/env python3.4

# IMPORTS
import re
import socket
from colorama import init, Fore, Back, Style

# VARS
# Raw string, so the regex escapes (\d, \., \[) reach the re module untouched
# instead of being parsed as (invalid) string escapes first.
# Groups: 1 client address, 2 timestamp, 3 request, 4 status code,
# 5 response size, 6 referer, 7 user agent.
# The size group also accepts "-", which Apache writes instead of a number
# when a response carries no body.
regex = r'([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+|-) "(.*?)" "(.*?)"'
log_path = '/var/log/apache2/access.log'
# Last address handed to get_hostname() and the name it resolved to.
previous_ip = " "
previous_host = " "

# FUNCTIONS
def adjust_item( str, i ):
    """Pad ``str`` on the right with spaces until it is ``i`` characters long.

    A string that is already ``i`` characters or longer is returned
    unchanged; this function never truncates.

    Args:
        str: text to pad. The name shadows the builtin inside this function
            only; it is kept so that existing callers are unaffected.
        i: minimum length of the returned string.

    Returns:
        The padded string.
    """
    # ljust pads in a single step instead of appending one space per loop pass.
    return str.ljust(i)

def get_hostname( ip ):
    """Return the reverse-DNS name for ``ip``, or ``ip`` itself if lookup fails.

    Only the most recent lookup is remembered (in the module-level
    ``previous_ip`` / ``previous_host``), so asking for the same address
    twice in a row triggers a single ``socket.gethostbyaddr`` call.

    Args:
        ip: address to resolve.

    Returns:
        The primary host name reported by ``socket.gethostbyaddr``, or the
        unchanged ``ip`` when the lookup raises.
    """
    global previous_ip
    global previous_host

    if previous_ip != ip:
        try:
            resolved = socket.gethostbyaddr(ip)[0]
        except Exception:
            # Best effort: any lookup failure falls back to the raw address.
            resolved = ip
        previous_ip = ip
        previous_host = resolved

    return previous_host

# READ FILE
# errors='replace' substitutes a placeholder for any byte sequence that is not
# valid in the default encoding, so one odd byte in the log cannot abort the
# whole run with UnicodeDecodeError.
with open(log_path, 'rt', errors='replace') as log:
    text = log.read()

# FOR EACH LINE
# Colour of the status-code column, keyed by the first digit of the code.
# Codes starting with any other digit are printed without colour.
code_colors = {
    "2": Fore.GREEN,
    "3": Fore.YELLOW,
    "4": Fore.BLUE,
    "5": Fore.MAGENTA,
}

# Compile once; the same pattern is applied to every line.
line_pattern = re.compile(regex)

for line in text.split("\n"):
    # Ignore blank and very short lines.
    if len(line) <= 5:
        continue

    # Match once per line. Lines that do not fit the pattern are skipped
    # instead of crashing on `None.group(...)`.
    fields = line_pattern.match(line)
    if fields is None:
        continue

    # PARSE LINE AND ADJUST FIELD LENGTH
    ip = adjust_item(fields.group(1), 15)
    hostname = adjust_item(str(get_hostname(ip.strip())), 30)
    date = fields.group(2).split(" ")[0]
    request = adjust_item(fields.group(3), 40)
    code = adjust_item(fields.group(4), 4)
    size = adjust_item(fields.group(5), 8)
    ref = adjust_item(fields.group(6), 30)
    agent = adjust_item(fields.group(7), 3)

    # STATUS CODE COLUMN
    status_class = code.strip()[0]
    code_color = code_colors.get(status_class)
    if code_color is None:
        shown_code = code[:4]
    else:
        shown_code = code_color + Style.BRIGHT + code[:4] + Fore.RESET + Style.NORMAL

    # REQUEST COLUMN
    # Requests whose first letter is neither "G" nor "P" are highlighted,
    # but only for 2xx and 4xx responses (each with its own styling).
    shown_request = request[:40]
    if request[0] != "G" and request[0] != "P":
        if status_class == "2":
            shown_request = (
                Back.BLACK + Fore.RED + Style.DIM + request[:40]
                + Fore.RESET + Back.RESET + Style.NORMAL
            )
        elif status_class == "4":
            shown_request = Fore.RED + request[:40] + Fore.RESET

    # No separator after the code: its padding already ends in a space.
    print(
        date + " "
        + shown_code
        + ip[:15] + " "
        + hostname[:30] + " "
        + size[:8] + " "
        + shown_request + " "
        + ref[:30] + " "
        + agent[:3]
    )

You will see output like this:
log

Apache access log parser

[with reverse DNS check and colors]

Nothing special here really. Just a few lines of code to make reviewing the logs a little bit easier.

Displayed columns in order from left to right:

  • Date and time of access
  • HTTP CODE of response [200 in green, 404 in blue, rest in red]
  • IP address
  • Reverse DNS hostname [last 30 chars] [empty if NXDOMAIN]
  • Request [first 30 chars]

 


The output:

apache log parsing output


Script:


#!/bin/bash

# Print one colourised summary row per line of the Apache access log:
# date, time, HTTP code, client IP, reverse-DNS name and request.

# ANSI colour sequences held as real escape characters (via $'...'), so the
# row can be printed verbatim instead of through `echo -e`, which would also
# interpret backslash sequences that come from the log line itself.
grey=$'\033[01;30m'
red=$'\033[01;31m'
green=$'\033[01;32m'
blue=$'\033[01;34m'
reset=$'\033[00m'

# -r keeps backslashes in the log line literal.
while read -r line
do
        # NOTHING TO PARSE ON AN EMPTY LINE
          if [[ -z $line ]]; then
                continue
          fi

        # IP (first space-separated field)
        # The line is always passed quoted, so the shell neither glob-expands
        # nor re-splits text taken from the log.
          ip=$(printf '%s\n' "$line" | cut -d " " -f 1)

        # HOST (fifth word of the last line printed by `host`)
          host=$(host "$ip" | cut -d " " -f 5 | tail -1)

        # PAD THE IP TO 15 CHARS (after the lookup, which needs it unpadded)
          printf -v ip '%-15s' "$ip"

        # IF I DO NOT GET DOMAIN NAME
          if [[ $host == *NXDOMAIN* ]]; then
                host=" - "
          fi

        # EVEN UP THE HOSTNAME TO SEE LAST 30 CHARS
          if [[ ${#host} -lt 30 ]]; then
                printf -v host '%-30s' "$host"
          else
                host=${host:$(( ${#host} - 30 ))}
          fi

          dhost="${grey}${host}${reset}"

        #   DISPLAY GOOGLEBOT CUSTOM DNS
          if [[ $host == *google* ]]; then
                dhost="${grey}GOOGLEBOT${reset}                     "
          fi

        # DATE (text between "[" and "]", cut at the "+" of the UTC offset)
          date=$(printf '%s\n' "$line" | cut -d "[" -f 2 | cut -d "]" -f 1 | cut -d "+" -f 1)
                day=$(printf '%s\n' "$date" | cut -d ":" -f 1 | tr -d " ")
                dtime=$(printf '%s\n' "$date" | cut -d ":" -f 2- | tr -d " ")

        # REQUEST (first two words of the first quoted field after "]")
          req=$(printf '%s\n' "$line" | cut -d "]" -f 2 | cut -d "\"" -f 2 | cut -d " " -f -2)
        # CUT REQUEST TO 30 CHARS
          dreq=${req:0:30}
        # CUSTOM REQUEST INFO IN CASE OF ADMIN PANEL
          if [[ $req == *"admin.php"* ]]; then
                dreq="${red}FAV${reset}"
          fi

        # HTTP CODE
        # Compared as strings on purpose: inside [[ ]] the operands of -eq
        # are evaluated as arithmetic expressions, which must never happen
        # with text taken from the log.
          code=$(printf '%s\n' "$line" | cut -d "\"" -f 3 | cut -d " " -f 2)
          hcode="${red}${code}${reset}"
          if [[ $code == "200" ]]; then
                hcode="${green}${code}${reset}"
          fi
          if [[ $code == "404" ]]; then
                hcode="${blue}${code}${reset}"
          fi

        # DISPLAY
          # I DONT WANT TO DISPLAY FAVICON REQUESTS
          if [[ $req != *"favicon.ico"* ]]; then
                printf '%s\n' "$day $dtime $hcode $ip $dhost $dreq"
          fi
done < /var/log/apache2/access.log

Obrazek