• 🌙 Community Spirit

    Ramadan Mubarak! To honor this month, Crax has paused NSFW categories. Wishing you peace and growth!

Email Id Extractor Project from sites in Scrapy Python (1 Viewer)

Currently reading:
 Email Id Extractor Project from sites in Scrapy Python (1 Viewer)

Recently searched:

m3uZael

Member
LV
0
Joined
Jan 25, 2024
Threads
11
Likes
0
Awards
2
Credits
965©
Cash
0$
Python:
# web scraping framework
import scrapy

# for regular expression
import re

# for selenium request
from scrapy_selenium import SeleniumRequest

# for link extraction
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

class EmailtrackSpider(scrapy.Spider):
    # name of spider
    name = 'emailtrack'

    # to have unique email ids
    uniqueemail = set()

    
    # and parse function is called
    def start_requests(self):
        yield SeleniumRequest(
            url="https://www.google.com",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
            # this helps to get all links from source code
            links = LxmlLinkExtractor(allow=()).extract_links(response)

            # Finallinks contains links url
            Finallinks = [str(link.url) for link in links]

            # links list for url that may have email ids
            links = []

            # filtering and storing only needed url in links list
            # pages that are about us and contact us are the ones that have email ids
            for link in Finallinks:
                if ('Contact' in link or 'contact' in link or 'About' in link or 'about' in link or 'CONTACT' in link or 'ABOUT' in link):
                    links.append(link)

            # current page url also added because few sites have email ids on there main page
            links.append(str(response.url))



            # parse_link function is called for extracting email ids
            l = links[0]
            links.pop(0)

            # meta helps to transfer links list from parse to parse_link
            yield SeleniumRequest(
                url=l,
                wait_time=3,
                screenshot=True,
                callback=self.parse_link,
                dont_filter=True,
                meta={'links': links}
            )


    def parse_link(self, response):

        # response.meta['links'] this helps to get links list
        links = response.meta['links']
        flag = 0

        # links that contains following bad words are discarded
        bad_words = ['facebook', 'instagram', 'youtube', 'twitter', 'wiki', 'linkedin']

        for word in bad_words:
            # if any bad word is found in the current page url
            # flag is assigned to 1
            if word in str(response.url):
                flag = 1
                break

        # if flag is 1 then no need to get email from
        # that url/page
        if (flag != 1):
            html_text = str(response.text)
            # regular expression used for email id
            email_list = re.findall('\w+@\w+\.{1}\w+', html_text)
            # set of email_list to get unique
            email_list = set(email_list)
            if (len(email_list) != 0):
                for i in email_list:
                    # adding email ids to final uniqueemail
                    self.uniqueemail.add(i)

        # parse_link function is called till
        # if condition satisfy
        # else move to parsed function
        if (len(links) > 0):
            l = links[0]
            links.pop(0)
            yield SeleniumRequest(
                url=l,
                callback=self.parse_link,
                dont_filter=True,
                meta={'links': links}
            )
        else:
            yield SeleniumRequest(
                url=response.url,
                callback=self.parsed,
                dont_filter=True
            )

    def parsed(self, response):
        # emails list of uniqueemail set
        emails = list(self.uniqueemail)
        finalemail = []

        for email in emails:
            # avoid garbage value by using '.in' and '.com'
            # and append email ids to finalemail
            if ('.in' in email or '.com' in email or 'info' in email or 'org' in email):

                finalemail.append(email)

        # final unique email ids from geeksforgeeks site
        print('\n'*2)
        print("Emails scraped", finalemail)
        print('\n'*2)
 

Create an account or login to comment

You must be a member in order to leave a comment

Create account

Create an account on our community. It's easy!

Log in

Already have an account? Log in here.

Tips
Recently searched:

Similar threads

Users who are viewing this thread

Top Bottom