UnderwareDesign
December 11, 2017, 01:23:05 AM *
News: G2D - 2D OpenGL library V007 for PlayBASIC V1.64P - Including Programmable Particles   (Released 22nd,Nov 2014)
   Home    
Pages: [1]
 
Author Topic: Compute List Of Unique Email Addresses  (Read 137 times)
Member
Development Team


WWW
« on: September 24, 2017, 10:55:24 AM »


   Compute List Of Unique Email Addresses

     This code  skim through an input file looking for rows with email addresses in them,  in the file I have,  I only needed to briefly parse the rows, as they look a bit like this

    "0000  WORD  EMAIL@ADDRESS  SOME OTHER NUMBERS"

    so it loads the file to a string, splits it to an array, then looks as the rows and splits them to an array and builds a list of all the email addresses in the file.   It doesn't really check if it's a real address, it assume any string with an @ followed by a dot after it, is likely to be an email address (it might not be!)..  But it's good enough for my MY needs.. If you need it to do something else, then YOU write it yourself...  Smiley

     Once the file has been processed,  it then tries to build a list of unique emails from raw email list then dump it out to a save file.. So you end up with a list of email domains.
  

     NOTE:  This code needs the location of the file with the email addresses in it to work.  

PlayBASIC Code:
   //----------------------------------------------------------------------
   //---[ FILE NAMES ]-----------------------------------------------------
   //----------------------------------------------------------------------

   EmailList_FileName$   ="RAW-Email-LIst.txt"
   Results_SaveName$      ="Unique-Domains.txt"

   //----------------------------------------------------------------------

   if fileexist(EmailList_FileName$)
      Dim EmailList$(0)

      LoadAndParseEmails(EmailList_FileName$   ,EmailList$())

      count=Compute_List_Of_Common_Emails(EmailList$())
      print count
      
      if Fileexist(SaveName$) then deletefile Results_SaveName$
   
      fh=writenewfile(Results_SaveName$)
      if fh
   
         for lp =0 to Count
            e$=EmailList$(lp)
            print e$
            writestring fh,e$ 
         next

         Closefile FH
      endif   

   else
         print "Can't find input file"
         print EmailList_FileName$   
   endif   
   
   sync
   waitkey



function LoadAndParseEmails(Filename$,Email$())
   
   if fileexist(filename$)
      size=filesize(filename$)
      fh=readnewfile(filename$)
      
      if fh
          s$=readchr$(fh,size)       
          closefile fh

          // strip spaces   
            s$=replace$(s$,chr$(9)," ")
            do   
               s$=replace$(s$,"  "," ")
            loop instring(s$,"  ")<1

            s$=replace$(s$,chr$(13),"")

            Dim Output$(Size/2)
            LinesInFile = SplitToArray(s$,chr$(10), Output$())
            
            dim row$(255)
            
            
            DIM Email$(LinesInFile)
            
            for lp =0 to LInesInfile
               s$=trim$(Output$(lp))
               
               // look for line bigger than say 5 characters as 
               if len(s$)>5
                  // cut the row up, then skim through and look for anything
                  // with the @ sign in it, so we assume it's an email address..
                  // it might not be but i DONT CARE ! :) 
                  TokensInRow=SplitToArray(s$," ",row$())
                  For TokensLp=0 to TokensInRow-1
                     Token$=row$(TokensLP)
                     atpos=instring(Token$,"@")
                     if atpos>1
                           if instring(Token$,".",atpos)>atpos   
                              Email$(EmailCount)=Token$                              
                              EmailCount++
                           endif
                     endif
                  next
               endif
            next      
   
            ReDim Email$(EmailCount)
            undim row$()
            undim Output$()
      endif
      
   endif
   
   
Endfunction





function Compute_List_Of_Common_Emails(Emails$())
   
   Size=$10000
   DIm HashTable(Size)
   Dim HashEmail$(Size)
   For lp =0 to GetArrayElements(EmailList$())-1

         Email$=Emails$(lp)   
         EmailDomain$=""
         Pos=instring(Email$,"@")
         if Pos
            EmailDomain$=CutLeft$(Email$,Pos-1)            

            HASH=HashString(EmailDomain$)
            if HashTable(HASH)=0
                  HashEmail$(Hash) = EmailDomain$
            else               
                  HashEmail$=HashEmail$(Hash)
                  if instring(HashEmail$,EmailDomain$)<1
                        HashEmail$(Hash)+=" "+EmailDomain$   
                  endif
            endif

            HashTable(HASH)++   

         endif
               
   next
   
   
   EmailCount = 0
   
   Dim Row$(255)
   
   
   // build list of email domains
    for hash =0 to $ffff
       count=HashTable(HASH)
       if count
          TheseEmails$=HashEmail$(Hash)
//          print "Hash:$"+right$(hex$(Hash),4)+" Count "+digits$(count,4)+"   "+TheseEmails$
         EmailsOfSameHash = SplittoArray(TheseEmails$," ",Row$())
         for lp=0 to EmailsOfSameHash-1
               Emails$(EmailCount)=row$(lp)
               EmailCount++                              

Login required to view complete source code

Logged

Pages: [1]
 
 
Jump to:  

Powered by MySQL Powered by PHP Powered by SMF 1.1.13 | SMF © 2006-2009, Simple Machines LLC | Privacy Policy Valid XHTML 1.0! Valid CSS!