UnderwareDESIGN

PlayBASIC => Resources => Source Codes => Topic started by: kevin on September 24, 2017, 10:55:24 AM

Title: Compute List Of Unique Email Addresses
Post by: kevin on September 24, 2017, 10:55:24 AM
   Compute List Of Unique Email Addresses

    This code  skim through an input file looking for rows with email addresses in them,  in the file I have,  I only needed to briefly parse the rows, as they look a bit like this

    "0000  WORD  EMAIL@ADDRESS  SOME OTHER NUMBERS"

   so it loads the file to a string, splits it to an array, then looks as the rows and splits them to an array and builds a list of all the email addresses in the file.   It doesn't really check if it's a real address, it assume any string with an @ followed by a dot after it, is likely to be an email address (it might not be!)..  But it's good enough for my MY needs.. If you need it to do something else, then YOU write it yourself...  :)

    Once the file has been processed,  it then tries to build a list of unique emails from raw email list then dump it out to a save file.. So you end up with a list of email domains.
 

    NOTE:  This code needs the location of the file with the email addresses in it to work.  

[pbcode]

   //----------------------------------------------------------------------
   //---[ FILE NAMES ]-----------------------------------------------------
   //----------------------------------------------------------------------

   EmailList_FileName$   ="RAW-Email-LIst.txt"
   Results_SaveName$      ="Unique-Domains.txt"

   //----------------------------------------------------------------------

   if fileexist(EmailList_FileName$)
      Dim EmailList$(0)

      LoadAndParseEmails(EmailList_FileName$   ,EmailList$())

      count=Compute_List_Of_Common_Emails(EmailList$())
      print count
      
      if Fileexist(SaveName$) then deletefile Results_SaveName$
   
      fh=writenewfile(Results_SaveName$)
      if fh
   
         for lp =0 to Count
            e$=EmailList$(lp)
            print e$
            writestring fh,e$
         next

         Closefile FH
      endif   

   else
         print "Can't find input file"
         print EmailList_FileName$   
   endif   
   
   sync
   waitkey



function LoadAndParseEmails(Filename$,Email$())
   
   if fileexist(filename$)
      size=filesize(filename$)
      fh=readnewfile(filename$)
      
      if fh
          s$=readchr$(fh,size)       
          closefile fh

          // strip spaces   
            s$=replace$(s$,chr$(9)," ")
            do   
               s$=replace$(s$,"  "," ")
            loop instring(s$,"  ")<1

            s$=replace$(s$,chr$(13),"")

            Dim Output$(Size/2)
            LinesInFile = SplitToArray(s$,chr$(10), Output$())
            
            dim row$(255)
            
            
            DIM Email$(LinesInFile)
            
            for lp =0 to LInesInfile
               s$=trim$(Output$(lp))
               
               // look for line bigger than say 5 characters as
               if len(s$)>5
                  // cut the row up, then skim through and look for anything
                  // with the @ sign in it, so we assume it's an email address..
                  // it might not be but i DONT CARE ! :)
                  TokensInRow=SplitToArray(s$," ",row$())
                  For TokensLp=0 to TokensInRow-1
                     Token$=row$(TokensLP)
                     atpos=instring(Token$,"@")
                     if atpos>1
                           if instring(Token$,".",atpos)>atpos   
                              Email$(EmailCount)=Token$                              
                              EmailCount++
                           endif
                     endif
                  next
               endif
            next      
   
            ReDim Email$(EmailCount)
            undim row$()
            undim Output$()
      endif
      
   endif
   
   
Endfunction





function Compute_List_Of_Common_Emails(Emails$())
   
   Size=$10000
   DIm HashTable(Size)
   Dim HashEmail$(Size)
   For lp =0 to GetArrayElements(EmailList$())-1

         Email$=Emails$(lp)   
         EmailDomain$=""
         Pos=instring(Email$,"@")
         if Pos
            EmailDomain$=CutLeft$(Email$,Pos-1)            

            HASH=HashString(EmailDomain$)
            if HashTable(HASH)=0
                  HashEmail$(Hash) = EmailDomain$
            else               
                  HashEmail$=HashEmail$(Hash)
                  if instring(HashEmail$,EmailDomain$)<1
                        HashEmail$(Hash)+=" "+EmailDomain$   
                  endif
            endif

            HashTable(HASH)++   

         endif
               
   next
   
   
   EmailCount = 0
   
   Dim Row$(255)
   
   
   // build list of email domains
    for hash =0 to $ffff
       count=HashTable(HASH)
      if count
          TheseEmails$=HashEmail$(Hash)
//          print "Hash:$"+right$(hex$(Hash),4)+" Count "+digits$(count,4)+"   "+TheseEmails$
         EmailsOfSameHash = SplittoArray(TheseEmails$," ",Row$())
         for lp=0 to EmailsOfSameHash-1
               Emails$(EmailCount)=row$(lp)
               EmailCount++                              
         next

      endif
   next
      
   if EmailCount>0
      EmailCount--
      redim Emails$(EmailCount)
   else
      dim emails$(0)
   
   endif      

EndFunction EmailCount


Function HashString(S$)
      Hash = Len(s$)
      for lp =1 to Len(s$)
            ThisCHR=mid(s$,lp)
          Hash = rol16(hash xor (ThisCHR*lp),1)
      next
      hash = hash and $ffff   
EndFunction Hash

[/pbcode]