Code:
#use it like this:
# ruby LogParser.rb logFile.log
#!/usr/local/bin/ruby
require 'date'
# One parsed line of an Apache access log. The layout accepted here is
#   host user auth [timestamp] "request" status bytes "referrer" "user-agent"
# which looks like Apache's "combined" format.
#
# Field names are kept as the rest of the script expects them: +user+ is the
# second space-separated field and +auth+ is the third.
class LogEntry
  attr_reader :host, :user, :auth, :date, :referrer, :ua, :rcode, :nbytes, :url

  # Whole-entry pattern. It is applied to the line with its terminator already
  # removed, so it can be anchored to the full string.
  ENTRY_PATTERN = /\A(\S+) (\S+) (\S+) \[(.+)\] "(.+)" (\d{3}) (\d+|-) "(.*?)" "(.*?)"\z/
  # Request-line pattern: method, URL, protocol.
  REQUEST_PATTERN = /\A(\S+)\s+(\S+)\s+(\S+)\Z/

  # Parses a single log line.
  #
  # @param line [String] one log line, with or without a trailing "\n" / "\r\n"
  # @raise [ArgumentError] if the line or its request part does not have the
  #   expected layout, or a numeric field cannot be converted
  def initialize(line)
    # chomp removes "\n" as well as "\r\n"; a leftover "\r" would otherwise
    # make every line of a CRLF-terminated file fail to match.
    text = line.to_s.chomp
    entry = ENTRY_PATTERN.match(text)
    raise ArgumentError, 'unrecognized log entry format' unless entry

    @host, @user, @auth, ds, request, code, bs, @referrer, @ua = entry.captures
    @date = DateTime.strptime(ds, '%d/%b/%Y:%H:%M:%S %z')
    # Explicit base 10 so a leading zero is not read as an octal prefix.
    @rcode = Integer(code, 10)
    # A "-" byte count means no body was sent; it is counted as 0.
    @nbytes = bs == '-' ? 0 : Integer(bs, 10)

    parts = REQUEST_PATTERN.match(request)
    raise ArgumentError, "unrecognized request line: #{request.inspect}" unless parts

    @method, @url, @proto = parts.captures
  end

  # @return [String] a one-line summary of the entry's public fields
  def to_s
    "LogEntry[host:#{host}, date:#{date}, referrer:#{referrer}, " \
      "url:#{url}, ua:#{ua}, user:#{user}, auth:#{auth}, " \
      "rcode:#{rcode}, nbytes:#{nbytes}]"
  end
end
# Main pass: read the log file named by the first command-line argument,
# build a LogEntry per line, and tally hosts, URLs, referrers and user-agent
# strings. Lines that fail to parse are reported and skipped, but still
# counted in nlines.
if ARGV.empty?
  puts "Usage:: [ruby] LogParser.rb <inpfile>"
  # Without an input path there is nothing to do; stop instead of falling
  # through to File.open(nil).
  exit 1
end
t1 = Time.now
nlines = 0
start_date = end_date = nil
le = nil
hosts = Hash.new(0)
urls = Hash.new(0)
referrers = Hash.new(0)
uastrings = Hash.new(0)
st = Time.now
# The block form of File.open closes the file when the block exits, including
# when an unexpected error propagates out of it.
File.open(ARGV[0]) do |inpfile|
  inpfile.each_line do |line|
    begin
      le = LogEntry.new(line)
      start_date ||= le.date
      hosts[le.host] += 1
      urls[le.url] += 1
      referrers[le.referrer] += 1
      uastrings[le.ua] += 1
    rescue StandardError => e
      print "Log entry parse failed at line: ", (nlines + 1), ", error: ", e, "\n"
      print "LINE: ", line, "\n"
    end
    nlines += 1
    # Progress report every 4096 lines, with the time taken by that batch.
    if (nlines % 4096).zero?
      et = Time.now
      puts "processed #{nlines} lines ... (#{et - st} seconds)"
      st = et
    end
  end
end
# le is the last entry that parsed successfully; it stays nil when the file is
# empty or no line could be parsed, in which case end_date is left as nil.
end_date = le.date if le
t2 = Time.now
printf("start_date:%s, end_date:%s\n", start_date.to_s, end_date.to_s)
printf("lines:%d, hosts:%d, urls:%d, referrers:%d, uastrings:%d\n",
       nlines, hosts.length, urls.length, referrers.length, uastrings.length)
print "Processing time : ", (t2 - t1).to_s, " seconds\n"
# Do the sorting and display of top 20
# Prints a heading followed by up to 20 entries of a tally hash, highest
# count first, then a blank line. Rank numbers start at 0.
#
# @param label [String] text shown in the heading line
# @param h [Hash] maps each key to its occurrence count
# @return [nil]
def print_top20(label, h)
  ranked = h.sort { |a, b| b[1] <=> a[1] }
  print "------------ " + label + " -------------\n"
  # first(20) yields fewer rows when the hash has under 20 keys, so no
  # exception needs to be swallowed to stop early.
  ranked.first(20).each_with_index do |(key, count), i|
    printf("%2d. %s (%d)\n", i, key, count)
  end
  puts
end
# Display phase: print the four top-20 tables and report how long the
# sorting and printing took.
t1 = Time.now
[
  ["Top 20 Hosts", hosts],
  ["Top 20 URLs", urls],
  ["Top 20 Referrers", referrers],
  ["Top 20 UA Strings", uastrings]
].each do |title, counts|
  print_top20(title, counts)
end
t2 = Time.now
elapsed = t2 - t1
print "Sort and Display time: ", elapsed.to_s, " seconds\n"
This code is used to parse an Apache log file. When I run it on a small file, with thousands of log entries, it works properly. However, when I attempt to parse a real file, about 10 MB with tens of thousands of lines, it no longer works and returns an error message for each line, for example:
Bookmarks