Compute List Of Unique Email Addresses
This code skim through an input file looking for rows with email addresses in them, in the file I have, I only needed to briefly parse the rows, as they look a bit like this
"0000 WORD EMAIL@ADDRESS SOME OTHER NUMBERS"
so it loads the file to a string, splits it to an array, then looks as the rows and splits them to an array and builds a list of all the email addresses in the file. It doesn't really check if it's a real address, it assume any string with an @ followed by a dot after it, is likely to be an email address (it might not be!).. But it's good enough for my MY needs.. If you need it to do something else, then YOU write it yourself... :)
Once the file has been processed, it then tries to build a list of unique emails from raw email list then dump it out to a save file.. So you end up with a list of email domains.
NOTE: This code needs the location of the file with the email addresses in it to work.
[pbcode]
//----------------------------------------------------------------------
//---[ FILE NAMES ]-----------------------------------------------------
//----------------------------------------------------------------------
EmailList_FileName$ ="RAW-Email-LIst.txt"
Results_SaveName$ ="Unique-Domains.txt"
//----------------------------------------------------------------------
if fileexist(EmailList_FileName$)
Dim EmailList$(0)
LoadAndParseEmails(EmailList_FileName$ ,EmailList$())
count=Compute_List_Of_Common_Emails(EmailList$())
print count
if Fileexist(SaveName$) then deletefile Results_SaveName$
fh=writenewfile(Results_SaveName$)
if fh
for lp =0 to Count
e$=EmailList$(lp)
print e$
writestring fh,e$
next
Closefile FH
endif
else
print "Can't find input file"
print EmailList_FileName$
endif
sync
waitkey
function LoadAndParseEmails(Filename$,Email$())
if fileexist(filename$)
size=filesize(filename$)
fh=readnewfile(filename$)
if fh
s$=readchr$(fh,size)
closefile fh
// strip spaces
s$=replace$(s$,chr$(9)," ")
do
s$=replace$(s$," "," ")
loop instring(s$," ")<1
s$=replace$(s$,chr$(13),"")
Dim Output$(Size/2)
LinesInFile = SplitToArray(s$,chr$(10), Output$())
dim row$(255)
DIM Email$(LinesInFile)
for lp =0 to LInesInfile
s$=trim$(Output$(lp))
// look for line bigger than say 5 characters as
if len(s$)>5
// cut the row up, then skim through and look for anything
// with the @ sign in it, so we assume it's an email address..
// it might not be but i DONT CARE ! :)
TokensInRow=SplitToArray(s$," ",row$())
For TokensLp=0 to TokensInRow-1
Token$=row$(TokensLP)
atpos=instring(Token$,"@")
if atpos>1
if instring(Token$,".",atpos)>atpos
Email$(EmailCount)=Token$
EmailCount++
endif
endif
next
endif
next
ReDim Email$(EmailCount)
undim row$()
undim Output$()
endif
endif
Endfunction
function Compute_List_Of_Common_Emails(Emails$())
Size=$10000
DIm HashTable(Size)
Dim HashEmail$(Size)
For lp =0 to GetArrayElements(EmailList$())-1
Email$=Emails$(lp)
EmailDomain$=""
Pos=instring(Email$,"@")
if Pos
EmailDomain$=CutLeft$(Email$,Pos-1)
HASH=HashString(EmailDomain$)
if HashTable(HASH)=0
HashEmail$(Hash) = EmailDomain$
else
HashEmail$=HashEmail$(Hash)
if instring(HashEmail$,EmailDomain$)<1
HashEmail$(Hash)+=" "+EmailDomain$
endif
endif
HashTable(HASH)++
endif
next
EmailCount = 0
Dim Row$(255)
// build list of email domains
for hash =0 to $ffff
count=HashTable(HASH)
if count
TheseEmails$=HashEmail$(Hash)
// print "Hash:$"+right$(hex$(Hash),4)+" Count "+digits$(count,4)+" "+TheseEmails$
EmailsOfSameHash = SplittoArray(TheseEmails$," ",Row$())
for lp=0 to EmailsOfSameHash-1
Emails$(EmailCount)=row$(lp)
EmailCount++
next
endif
next
if EmailCount>0
EmailCount--
redim Emails$(EmailCount)
else
dim emails$(0)
endif
EndFunction EmailCount
Function HashString(S$)
Hash = Len(s$)
for lp =1 to Len(s$)
ThisCHR=mid(s$,lp)
Hash = rol16(hash xor (ThisCHR*lp),1)
next
hash = hash and $ffff
EndFunction Hash
[/pbcode]