Compute List Of Unique Email Addresses

Started by kevin, September 24, 2017, 10:55:24 AM

Previous topic - Next topic

kevin

   Compute List Of Unique Email Addresses

    This code  skim through an input file looking for rows with email addresses in them,  in the file I have,  I only needed to briefly parse the rows, as they look a bit like this

    "0000  WORD  EMAIL@ADDRESS  SOME OTHER NUMBERS"

   so it loads the file to a string, splits it to an array, then looks as the rows and splits them to an array and builds a list of all the email addresses in the file.   It doesn't really check if it's a real address, it assume any string with an @ followed by a dot after it, is likely to be an email address (it might not be!)..  But it's good enough for my MY needs.. If you need it to do something else, then YOU write it yourself...  :)

    Once the file has been processed,  it then tries to build a list of unique emails from raw email list then dump it out to a save file.. So you end up with a list of email domains.
 

    NOTE:  This code needs the location of the file with the email addresses in it to work.  

PlayBASIC Code: [Select]
   //----------------------------------------------------------------------
//---[ FILE NAMES ]-----------------------------------------------------
//----------------------------------------------------------------------

EmailList_FileName$ ="RAW-Email-LIst.txt"
Results_SaveName$ ="Unique-Domains.txt"

//----------------------------------------------------------------------

if fileexist(EmailList_FileName$)
Dim EmailList$(0)

LoadAndParseEmails(EmailList_FileName$ ,EmailList$())

count=Compute_List_Of_Common_Emails(EmailList$())
print count

if Fileexist(SaveName$) then deletefile Results_SaveName$

fh=writenewfile(Results_SaveName$)
if fh

for lp =0 to Count
e$=EmailList$(lp)
print e$
writestring fh,e$
next

Closefile FH
endif

else
print "Can't find input file"
print EmailList_FileName$
endif

sync
waitkey



function LoadAndParseEmails(Filename$,Email$())

if fileexist(filename$)
size=filesize(filename$)
fh=readnewfile(filename$)

if fh
s$=readchr$(fh,size)
closefile fh

// strip spaces
s$=replace$(s$,chr$(9)," ")
do
s$=replace$(s$," "," ")
loop instring(s$," ")<1

s$=replace$(s$,chr$(13),"")

Dim Output$(Size/2)
LinesInFile = SplitToArray(s$,chr$(10), Output$())

dim row$(255)


DIM Email$(LinesInFile)

for lp =0 to LInesInfile
s$=trim$(Output$(lp))

// look for line bigger than say 5 characters as
if len(s$)>5
// cut the row up, then skim through and look for anything
// with the @ sign in it, so we assume it's an email address..
// it might not be but i DONT CARE ! :)
TokensInRow=SplitToArray(s$," ",row$())
For TokensLp=0 to TokensInRow-1
Token$=row$(TokensLP)
atpos=instring(Token$,"@")
if atpos>1
if instring(Token$,".",atpos)>atpos
Email$(EmailCount)=Token$
EmailCount++
endif
endif
next
endif
next

ReDim Email$(EmailCount)
undim row$()
undim Output$()
endif

endif


Endfunction





function Compute_List_Of_Common_Emails(Emails$())

Size=$10000
DIm HashTable(Size)
Dim HashEmail$(Size)
For lp =0 to GetArrayElements(EmailList$())-1

Email$=Emails$(lp)
EmailDomain$=""
Pos=instring(Email$,"@")
if Pos
EmailDomain$=CutLeft$(Email$,Pos-1)

HASH=HashString(EmailDomain$)
if HashTable(HASH)=0
HashEmail$(Hash) = EmailDomain$
else
HashEmail$=HashEmail$(Hash)
if instring(HashEmail$,EmailDomain$)<1
HashEmail$(Hash)+=" "+EmailDomain$
endif
endif

HashTable(HASH)++

endif

next


EmailCount = 0

Dim Row$(255)


// build list of email domains
for hash =0 to $ffff
count=HashTable(HASH)
if count
TheseEmails$=HashEmail$(Hash)
// print "Hash:$"+right$(hex$(Hash),4)+" Count "+digits$(count,4)+" "+TheseEmails$
EmailsOfSameHash = SplittoArray(TheseEmails$," ",Row$())
for lp=0 to EmailsOfSameHash-1
Emails$(EmailCount)=row$(lp)
EmailCount++
Login required to view complete source code