program parseUAs; {$ifdef FPC} {$mode objfpc} {$endif} {$H+} // interface uses SysUtils, Classes, Dom, XMLRead; type // UserAgent xml file info TUATypes = (uaBrowser, uaBot, uaCrawler, uaSpambot, uaCheck, uaProxy, uaDownload, uaPlugin, uaEngine, uaComponent, uaIgnore, uaOs, uaDistribution, uaCpu, uaDevice); TUAElement = record uaType: TUATypes; token, name, info: AnsiString; end; // UserAgent record TClientTypes = (ctBrowser, ctBot, ctCrawler, ctSpambot, ctCheck, ctProxy, ctDownload); TUserAgent = record ClientType: TClientTypes; ClientName, ClientVersion, ClientInfo, ClientRef, ClientBasedOn, ClientComponent, ClientPlugin, ClientLanguage: AnsiString; EngineName, EngineVersion: AnsiString; SystemName, SystemVersion, SystemDistribution: AnsiString; PlatformName, PlatformArchitecture: AnsiString; bot: boolean; remnant: AnsiString; end; // UserAgent reader class TUserAgentReader = class private FElement: array of TUAElement; public constructor Create(const AUserAgentXml: string); overload; destructor Destroy(); override; function GetUserAgentInfo(const AUserAgentString: string): TUserAgent; function GetEncodedUserAgentInfo(AUserAgentString: string): TUserAgent; end; // implementation // reads the UserAgent xml file into the internal FElement array constructor TUserAgentReader.Create(const AUserAgentXml: string); var Doc: TXMLDocument; el: TDOMNode; i, a: integer; begin inherited Create(); if FileExists(AUserAgentXml) then begin try ReadXMLFile(Doc, AUserAgentXml); el := Doc.DocumentElement; with el.ChildNodes do begin setLength(FElement, Count); for i:=0 to Count-1 do begin for a:=0 to Item[i].Attributes.Length-1 do case Item[i].Attributes[a].NodeName of 'token': FElement[i].token := Item[i].Attributes[a].NodeValue; 'name': FElement[i].name := Item[i].Attributes[a].NodeValue; 'info': FElement[i].info := Item[i].Attributes[a].NodeValue; 'type': case Item[i].Attributes[a].NodeValue of 'browser': FElement[i].uaType := uaBrowser; 'bot': FElement[i].uaType := uaBot; 'spambot': FElement[i].uaType := uaSpambot; 'check': FElement[i].uaType := uaCheck; 'proxy': FElement[i].uaType := uaProxy; 'download': FElement[i].uaType := uaDownload; 'plugin': FElement[i].uaType := uaPlugin; 'engine': FElement[i].uaType := uaEngine; 'component': FElement[i].uaType := uaComponent; 'ignore': FElement[i].uaType := uaIgnore; 'os': FElement[i].uaType := uaOs; 'distribution': FElement[i].uaType := uaDistribution; 'cpu': FElement[i].uaType := uaCpu; 'device': FElement[i].uaType := uaDevice; else writeln(IntToStr(i) + ':' + FElement[i].token + ' type:' + Item[i].Attributes[a].NodeValue); end; end; end; end; finally Doc.Free; end; end; end; destructor TUserAgentReader.Destroy; begin inherited Destroy; end; function TUserAgentReader.GetUserAgentInfo(const AUserAgentString: string): TUserAgent; // language checking const lang: array[0..186] of string = ( 'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az', 'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca', 'ce', 'ch', 'co', 'cr', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv', 'dz', 'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi', 'ho', 'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik', 'io', 'is', 'it', 'iu', 'ja', 'jv', 'ka', 'kg', 'ki', 'kj', 'kk', 'kl', 'km', 'kn', 'ko', 'kr', 'ks', 'ku', 'kv', 'kw', 'ky', 'la', 'lb', 'lg', 'li', 'ln', 'lo', 'lt', 'lu', 'lv', 'mg', 'mh', 'mi', 'mk', 'ml', 'mn', 'mo', 'mr', 'ms', 'mt', 'my', 'na', 'nb', 'nd', 'ne', 'ng', 'nl', 'nn', 'no', 'nr', 'nv', 'ny', 'oc', 'oj', 'om', 'or', 'os', 'pa', 'pi', 'pl', 'ps', 'pt', 'qu', 'rc', 'rm', 'rn', 'ro', 'ru', 'rw', 'sa', 'sc', 'sd', 'se', 'sg', 'sh', 'si', 'sk', 'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'ss', 'st', 'su', 'sv', 'sw', 'ta', 'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tr', 'ts', 'tt', 'tw', 'ty', 'ug', 'uk', 'ur', 'uz', 've', 'vi', 'vo', 'wa', 'wo', 'xh', 'yi', 'yo', 'za', 'zh', 'zu'); country: array[0..241] of string = ( 'AF', 'AL', 'DZ', 'AS', 'AD', 'AO', 'AI', 'AQ', 'AG', 'AR', 'AM', 'AW', 'AU', 'AT', 'AZ', 'BS', 'BH', 'BD', 'BB', 'BY', 'BE', 'BZ', 'BJ', 'BM', 'BT', 'BO', 'BA', 'BW', 'BV', 'BR', 'IO', 'VG', 'BN', 'BG', 'BF', 'BI', 'KH', 'CM', 'CA', 'KV', 'KY', 'CF', 'TD', 'CL', 'CX', 'CC', 'CO', 'KM', 'CG', 'CD', 'CK', 'CR', 'CI', 'HR', 'CU', 'CY', 'CZ', 'DK', 'DJ', 'DM', 'DO', 'TP', 'EC', 'EG', 'SV', 'GQ', 'ER', 'EE', 'ET', 'FK', 'FO', 'FJ', 'FI', 'FR', 'GF', 'PF', 'TF', 'GA', 'GM', 'GE', 'DE', 'GH', 'GI', 'GB', 'GR', 'GL', 'GD', 'GP', 'GU', 'GT', 'GN', 'GW', 'GY', 'HT', 'HM', 'VA', 'HN', 'HK', 'HU', 'IS', 'RE', 'IN', 'ID', 'IR', 'IQ', 'IE', 'IL', 'IT', 'JM', 'JP', 'JO', 'KZ', 'KE', 'KI', 'XK', 'KW', 'KG', 'LA', 'LV', 'LB', 'LS', 'LR', 'LY', 'LI', 'LT', 'LU', 'MO', 'MK', 'MG', 'MW', 'MY', 'MV', 'ML', 'MT', 'MH', 'MQ', 'MR', 'MU', 'YT', 'MX', 'FM', 'MD', 'MC', 'MN', 'ME', 'MS', 'MA', 'MZ', 'MM', 'NA', 'NR', 'NP', 'NL', 'AN', 'NC', 'NZ', 'NI', 'NE', 'NG', 'NU', 'NF', 'KP', 'MP', 'NO', 'OM', 'PK', 'PW', 'PS', 'PA', 'PG', 'PY', 'CN', 'PE', 'PH', 'PN', 'PL', 'PT', 'PR', 'QA', 'SM', 'RO', 'RU', 'RW', 'KN', 'LC', 'PM', 'VC', 'WS', 'ST', 'SA', 'SN', 'RS', 'SC', 'SL', 'SG', 'SK', 'SI', 'SB', 'SO', 'ZA', 'GS', 'KR', 'ES', 'LK', 'SH', 'SD', 'SR', 'SJ', 'SZ', 'SE', 'CH', 'SY', 'TW', 'TJ', 'TZ', 'TH', 'TG', 'TK', 'TO', 'TT', 'TN', 'TR', 'TM', 'TC', 'TV', 'UG', 'UA', 'AE', 'UK', 'US', 'UY', 'UM', 'UZ', 'VU', 'VE', 'VN', 'VI', 'WF', 'EH', 'YE', 'ZM', 'ZW'); function isLang(AString: string): boolean; var i: integer; begin for i := Low(lang) to High(lang) do if lang[i] = AString then exit(true); result := false; end; function isCountry(AString: string): boolean; var i: integer; begin if (AString[1] in ['A'..'Z']) then begin for i := Low(country) to High(country) do if country[i] = AString then exit(true) end else for i := Low(country) to High(country) do if lowercase(country[i]) = AString then exit(true); result := false; end; function isLangCountry(AString: string): boolean; begin if (length(AString) = 2) or ((length(AString) = 5) and (AString[3] in ['-', '_'])) then begin if (length(AString) = 5) and not isCountry(copy(AString, 4, 2)) then exit(false); result := isLang(copy(AString, 1, 2)); end else result := false; end; var bVersion, bRv, bName: boolean; sName, sVersion: string; procedure checkPart(const last: boolean = false); var bFound: boolean; i, l: integer; s: string; begin sName := trim(sName); sVersion := trim(sVersion); if sName <> '' then with result do begin bFound := false; for i := Low(FElement) to High(FElement) do begin s := FElement[i].token; l := length(s); if (s[l] in ['/', '*']) and (pos(copy(s, 1, l-1), sName)=1) then begin // "/" = a Version is following // "*" = the following is not of interest bFound := true; if (sVersion = '') and (s[l] = '/') then begin sVersion := copy(sName, l); if pos(' ', sVersion) > 0 then sVersion := copy(sVersion, 1, pos(' ', sVersion)-1); end; break; end else if (s = sName) then begin // exact match bFound := true; break; end; end; if bFound then begin // a match is found in storage case FElement[i].uaType of uaBrowser: // it's a browser if not bName then begin ClientType := ctBrowser; ClientName := ClientName + BoolToStr(ClientName<>'', ' ', '') + FElement[i].name; if (FElement[i].name <> 'Mobile') then bName := true; if not bVersion then ClientVersion := sVersion; ClientInfo := ClientInfo + BoolToStr(ClientInfo<>'', ' ', '') + FElement[i].info; end; uaBot, uaCrawler, uaSpambot, uaCheck, uaProxy, uaDownload: // it's some kind of bot begin case FElement[i].uaType of uaBot: ClientType := ctBot; uaCrawler: ClientType := ctCrawler; uaSpambot: ClientType := ctSpambot; uaCheck: ClientType := ctCheck; uaProxy: ClientType := ctProxy; uaDownload: ClientType := ctDownload; end; ClientName := FElement[i].name; bName := true; if sVersion <> '' then ClientVersion := sVersion; bot := true; end; uaEngine: // it's a engine if ((sVersion <> '') or bRv) then begin EngineName := FElement[i].name; if not bRv then EngineVersion := sVersion; end; uaPlugin: // it's some kind of plugin ClientPlugin := ClientPlugin + BoolToStr(ClientPlugin<>'' , ', ', '') + FElement[i].name + BooltoStr(sVersion<>'', '/' + sVersion, ''); uaComponent: // it's some kind of component ClientComponent := ClientComponent + BoolToStr(ClientComponent<>'' , ', ', '') + FElement[i].name + BooltoStr(sVersion<>'', '/' + sVersion, ''); uaOs: // it's a operating system if (pos('Windows', FElement[i].name)=1) then begin SystemName := 'Windows'; SystemVersion := copy(FElement[i].name, 9); end else begin if (FElement[i].name = 'Linux') and (sVersion <> '') then begin PlatformArchitecture := sVersion; sVersion := ''; end; SystemName := BoolToStr(SystemName<>'' , SystemName + ' ', '') + FElement[i].name; if sVersion <> '' then begin if pos('_', sVersion) > 0 then sVersion := StringReplace(sVersion, '_', '.', [rfReplaceAll]); SystemVersion := sVersion; end; end; uaDistribution: // it's a operating system distribution SystemDistribution := FElement[i].name + BooltoStr(sVersion<>'', '/' + sVersion, ''); uaCpu: // it's some kind of processor PlatformArchitecture := FElement[i].name + BooltoStr(sVersion<>'', '/' + sVersion, ''); uaDevice: // it's some device info PlatformName := FElement[i].name; end end else if sName = 'rv' then begin EngineVersion := sVersion; bRv := true; end else if (sName = 'Version') and (sVersion <> '') then begin if not bot then begin ClientVersion := sVersion; bVersion := true; end; end else if (pos('http:', sName)>0) or (pos('https:', sName)>0) or (pos('@', sName)>0) then ClientRef := sName else if pos('Win64', sName)>0 then SystemVersion := SystemVersion + ' 64bit' else if pos('WOW64', sName)>0 then begin SystemVersion := SystemVersion + ' 32bit'; PlatformArchitecture := 'x86_64'; end else if isLangCountry(sName) then ClientLanguage := sName { else if last and (ClientName = '') then begin // catch bots ClientName := sName; if sVersion <> '' then ClientVersion := sVersion; bot := true; end } else if (pos('crawler', lowercase(sName))>0) or (pos('spider', lowercase(sName))>0) or (pos('bot', lowercase(sName))>0) or (pos('Bot', lowercase(sName))>0) then remnant := remnant + BoolToStr(remnant<>'', ' ','') + '|'+'' + '|' else remnant := remnant + BoolToStr(remnant<>'', ' ','') + '|'+sName+BoolToStr(sVersion<>'', '/'+sVersion,'') + '|'; end; sName := ''; sVersion := ''; end; type TStates = (STATE_UNDEFINED, STATE_NAME, STATE_VERSION, STATE_COMMENT, STATE_COMMENT_VERSION, STATE_LANGUAGE); var state, fallbackState: TStates; i, len: integer; c: char; begin Finalize(result); FillChar(result, SizeOf(result), 0); result.bot := false; // check if it can be a user-agent if (AUserAgentString[1] in ['A'..'Z', 'a'..'z', '[']) then begin // init start values bVersion := false; bRv := false; bName := false; sName := ''; sVersion := ''; state := STATE_UNDEFINED; fallbackState := STATE_UNDEFINED; len := Length(AUserAgentString); for i:=1 to len do begin c := AUserAgentString[i]; // the state is undefined after each part and at the beginning if (state = STATE_UNDEFINED) then begin if (c = '(') then // start of comment state := STATE_COMMENT else if (c = '[') then // start of old language part state := STATE_LANGUAGE else if not (c in [' ', '+', ')', ';']) then // ignores this at the beginnig of a new part begin // start of a name part state := STATE_NAME; sName := c; end end else if (state = STATE_NAME) then begin if (c = '/') then // start of the version part state := STATE_VERSION else if (c = '+') then begin // something new starts checkPart(); state := STATE_UNDEFINED end else if (c = '(') then begin // start of comment checkPart(); state := STATE_COMMENT end else // go on writing the name sName := sName + c; end else if (state = STATE_VERSION) then begin if (c = ' ') then begin // something new starts state := STATE_UNDEFINED; checkPart(); end else if (c = '/') then begin // the first slash was part of the name (fix) sName := sName + '/' + sVersion; sVersion := ''; end else // go on writing the version sVersion := sVersion + c; end else if (state in [STATE_COMMENT, STATE_COMMENT_VERSION]) then begin if (c = ')') then begin // the comment ends, something new starts checkPart(); state := fallbackState; end else if (c = '(') then begin // there is a comment inner comment checkPart(); fallbackState := STATE_COMMENT; state := STATE_COMMENT; end else if (c in [';', ',']) then begin // next comment checkPart(); state := STATE_COMMENT; end else if (c in ['+', ' ']) and (sName = '') then // ignore + and space of the beginning of a comment else if (c = '/') and (pos('http:', sName)=0) and (pos('https:', sName)=0) then // start with version in comment part state := STATE_COMMENT_VERSION else if (c = ':') and (sName = 'rv') then // rv is a special case it is separated by a colon // start with version in comment part state := STATE_COMMENT_VERSION else begin if (state = STATE_COMMENT_VERSION) then // go on writing the comment version sVersion := sVersion + c else // go on writing the comment name sName := sName + c; end; end else if (state = STATE_LANGUAGE) then begin if (c = ']') then begin checkPart(); state := STATE_UNDEFINED end else sName := sName + c; end; end; // analyse the last part checkPart(true); end; if (result.ClientName = '') then result.ClientName := 'unknown'; end; function TUserAgentReader.GetEncodedUserAgentInfo(AUserAgentString: string): TUserAgent; begin AUserAgentString := StringReplace(AUserAgentString, '++', ' '#9, [rfReplaceAll]); AUserAgentString := StringReplace(AUserAgentString, '+(+http', ' ('#9'http', [rfReplaceAll]); AUserAgentString := StringReplace(AUserAgentString, '+', ' ', [rfReplaceAll]); AUserAgentString := StringReplace(AUserAgentString, #9, '+', [rfReplaceAll]); result := GetUserAgentInfo(AUserAgentString); end; function Dump(const AUserAgent: TUserAgent): string; var s: string; begin with AUserAgent do begin result := 'client' + BoolToStr(bot, ' (bot)', '') + ': ' + ClientName + BoolToStr(ClientVersion <> '', ' ' + ClientVersion, '') + BoolToStr(ClientInfo <> '', ' (' + ClientInfo + ')', '') + #$0A; s := ''; if (ClientRef <> '') then s := s + BoolToStr(s = '', ' ', '; ') + 'ref: ' + ClientRef; if (ClientBasedOn <> '') then s := s + BoolToStr(s = '', ' ', '; ') + 'based on: ' + ClientBasedOn; if (ClientComponent <> '') then s := s + BoolToStr(s = '', ' ', '; ') + 'component: ' + ClientComponent; if (ClientPlugin <> '') then s := s + BoolToStr(s = '', ' ', '; ') + 'plugin: ' + ClientPlugin; if (ClientLanguage <> '') then s := s + BoolToStr(s = '', ' ', '; ') + 'language: ' + ClientLanguage; if s <> '' then result := result + s + #$0A; if (EngineName <> '') then result := result + 'engine: ' + EngineName + BoolToStr(EngineVersion <> '', ' ' + EngineVersion, '') + #$0A; if (PlatformName <> '') or (PlatformArchitecture <> '') then result := result + 'platform:' + BoolToStr(PlatformName <> '', ' ' + PlatformName, '') + BoolToStr(PlatformArchitecture <> '', ' ' + PlatformArchitecture, '') + #$0A; if (SystemDistribution<>'') or (SystemName <> '') or (SystemVersion <> '') then result := result + 'system:' + BoolToStr(SystemDistribution <> '', ' ' + SystemDistribution, '') + BoolToStr(SystemName <> '', ' ' + SystemName, '') + BoolToStr(SystemVersion <> '', ' ' + SystemVersion, '') + #$0A; if (remnant <> '') then result := result + 'remnant: ' + remnant + #$0A; end; end; var i: integer; sUserAgent: string; UserAgent: TUserAgent; uaReader: TUserAgentReader; f: TextFile; StartTime, EndTime: QWord; begin StartTime := GetTickCount64(); uaReader := TUserAgentReader.Create(ExtractFilePath(ParamStr(0)) + 'ua.xml'); with TStringList.Create() do try LoadFromFile(ExtractFilePath(ParamStr(0)) + 'useragents.txt'); AssignFile(f, ExtractFilePath(ParamStr(0)) + 'useragents~my.txt'); rewrite(f); for i:=0 to Count-1 do begin sUserAgent := Strings[i]; UserAgent := uaReader.GetUserAgentInfo(sUserAgent); writeln(f, '* ' + sUserAgent); writeln(f, Dump(UserAgent)); end; CloseFile(f); finally free; end; uaReader.Free; EndTime := GetTickCount64(); WriteLn('TickCount: ' + IntToStr(EndTime - StartTime) + ' ms'); ReadLn(); end.