DEV Community

Cover image for Web Scraping GitHub Page
Anuoluwapo Balogun
Anuoluwapo Balogun

Posted on

Web Scraping GitHub Page

Use the Requests library to download web pages

import requests
Enter fullscreen mode Exit fullscreen mode
topics_url = 'https://github.com/topics'
Enter fullscreen mode Exit fullscreen mode
response = requests.get(topics_url)
Enter fullscreen mode Exit fullscreen mode
response.status_code
Enter fullscreen mode Exit fullscreen mode

200

len(response.text)
Enter fullscreen mode Exit fullscreen mode

177177

page_contents = response.text
Enter fullscreen mode Exit fullscreen mode
page_contents[:1000]
Enter fullscreen mode Exit fullscreen mode

'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark">\n <head>\n <meta charset="utf-8">\n <link rel="dns-prefetch" href="https://github.githubassets.com">\n <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n <link rel="dns-prefetch"....

Write the page to a file

# Encode the downloaded HTML as UTF-8 and write the bytes to webpage.html.
html_bytes = page_contents.encode("utf8")
with open('webpage.html', 'wb') as f:
    f.write(html_bytes)
Enter fullscreen mode Exit fullscreen mode

Parse and extract information

from bs4 import BeautifulSoup
Enter fullscreen mode Exit fullscreen mode
doc = BeautifulSoup(page_contents, 'html.parser')
Enter fullscreen mode Exit fullscreen mode
selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'

topic_title_tags = doc.find_all('p', {'class': selection_class})
Enter fullscreen mode Exit fullscreen mode
len(topic_title_tags)
Enter fullscreen mode Exit fullscreen mode

30

topic_title_tags[:5]
Enter fullscreen mode Exit fullscreen mode

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>]

selection_class = 'f5 color-fg-muted mb-0 mt-1'

topic_desc_tags = doc.find_all('p', {'class': selection_class})     
Enter fullscreen mode Exit fullscreen mode
len(topic_desc_tags)
Enter fullscreen mode Exit fullscreen mode

30

topic_desc_tags[:5]
Enter fullscreen mode Exit fullscreen mode

[<p class="f5 color-fg-muted mb-0 mt-1">
3D modeling is the process of virtually developing the surface and structure of a 3D object.
</p>,
<p class="f5 color-fg-muted mb-0 mt-1">
Ajax is a technique for creating interactive web applications.
</p>,
<p class="f5 color-fg-muted mb-0 mt-1">
Algorithms are self-contained sequences that carry out a variety of tasks.
</p>,.....

topic_title_tag0 = topic_title_tags[0]
Enter fullscreen mode Exit fullscreen mode
div_tag = topic_title_tag0.parent
Enter fullscreen mode Exit fullscreen mode
topic_link_tags = doc.find_all('a', {'class': 'flex-grow-0'})
Enter fullscreen mode Exit fullscreen mode
len(topic_link_tags)
Enter fullscreen mode Exit fullscreen mode

30

topic0_url = "https://github.com" + topic_link_tags[0]['href']
print(topic0_url)
Enter fullscreen mode Exit fullscreen mode

https://github.com/topics/3d

# Collect the text of every topic title tag.
topic_titles = [title_tag.text for title_tag in topic_title_tags]

print(topic_titles)
Enter fullscreen mode Exit fullscreen mode

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']

# The description text is wrapped in newlines in the page markup, so strip it.
topic_descs = [desc_tag.text.strip() for desc_tag in topic_desc_tags]

topic_descs[:5]
Enter fullscreen mode Exit fullscreen mode

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
'Ajax is a technique for creating interactive web applications.',
'Algorithms are self-contained sequences that carry out a variety of tasks.',
'Amp is a non-blocking concurrency library for PHP.',
'Android is an operating system built by Google designed for mobile devices.']

base_url = "https://github.com"

# The href values are site-relative (e.g. /topics/3d), so prefix the host.
topic_urls = [base_url + link_tag['href'] for link_tag in topic_link_tags]

topic_urls
Enter fullscreen mode Exit fullscreen mode

['https://github.com/topics/3d',
'https://github.com/topics/ajax',
'https://github.com/topics/algorithm',
'https://github.com/topics/amphp',
'https://github.com/topics/android',
'https://github.com/topics/angular',
'https://github.com/topics/ansible',
'https://github.com/topics/api',
'https://github.com/topics/arduino',
'https://github.com/topics/aspnet',
'https://github.com/topics/atom',
'https://github.com/topics/awesome']....

Create a CSV file

import pandas as pd
Enter fullscreen mode Exit fullscreen mode
topics_dict = {
    'title': topic_titles,
    'description': topic_descs,
    'url': topic_urls
}
Enter fullscreen mode Exit fullscreen mode
topics_df = pd.DataFrame(topics_dict)
Enter fullscreen mode Exit fullscreen mode
topics_df
Enter fullscreen mode Exit fullscreen mode

title description url
0 3D 3D modeling is the process of virtually develo... https://github.com/topics/3d
1 Ajax Ajax is a technique for creating interactive w... https://github.com/topics/ajax
2 Algorithm Algorithms are self-contained sequences that c... https://github.com/topics/algorithm
3 Amp Amp is a non-blocking concurrency library for ... https://github.com/topics/amphp
4 Android Android is an operating system built by Google... https://github.com/topics/android
5 Angular Angular is an open source web application plat... https://github.com/topics/angular
6 Ansible Ansible is a simple and powerful automation en... https://github.com/topics/ansible
7 API An API (Application Programming Interface) is ... https://github.com/topics/api
8 Arduino Arduino is an open source hardware and softwar... https://github.com/topics/arduino.....

topics_df.to_csv('topics.csv', index=None)
Enter fullscreen mode Exit fullscreen mode

Getting information out of a topic page

topic_page_url = topic_urls[0]
Enter fullscreen mode Exit fullscreen mode
topic_page_url
Enter fullscreen mode Exit fullscreen mode

'https://github.com/topics/3d'

response = requests.get(topic_page_url)
Enter fullscreen mode Exit fullscreen mode
response.status_code
Enter fullscreen mode Exit fullscreen mode

200

len(response.text)
Enter fullscreen mode Exit fullscreen mode

662290

topic_doc = BeautifulSoup(response.text, 'html.parser')
Enter fullscreen mode Exit fullscreen mode
h3_selection_class = "f3 color-fg-muted text-normal lh-condensed"


repo_tags = topic_doc.find_all('h3', {'class': h3_selection_class })
Enter fullscreen mode Exit fullscreen mode
len(repo_tags)
Enter fullscreen mode Exit fullscreen mode

30

repo_tags
Enter fullscreen mode Exit fullscreen mode

[<h3 class="f3 color-fg-muted text-normal lh-condensed">
<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-view-component="true" href="/mrdoob">
mrdoob
</a> /
<a class="text-bold wb-break-word" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d89752156923904a4238816bc9b51ab7772f3e3644ce897d8dd4e5" data-view-component="true" href="/mrdoob/three.js">
three.js....

Extract all the a tags from the first entry of repo_tags

a_tags = repo_tags[0].find_all('a')
Enter fullscreen mode Exit fullscreen mode
a_tags
Enter fullscreen mode Exit fullscreen mode

[<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-view-component="true" href="/mrdoob">
mrdoob...]

len(a_tags)
Enter fullscreen mode Exit fullscreen mode

2

a_tags[0].text.strip()
Enter fullscreen mode Exit fullscreen mode

'mrdoob'

a_tags[1].text.strip()
Enter fullscreen mode Exit fullscreen mode

'three.js'

base_url = 'https://github.com'

repo_url = base_url + a_tags[1]['href']
Enter fullscreen mode Exit fullscreen mode
repo_url
Enter fullscreen mode Exit fullscreen mode

'https://github.com/mrdoob/three.js'

star_tags = topic_doc.find_all('a', {'class': 'tooltipped tooltipped-s btn-sm btn BtnGroup-item color-bg-default'})
Enter fullscreen mode Exit fullscreen mode
len(star_tags)
Enter fullscreen mode Exit fullscreen mode

30

star_tags
Enter fullscreen mode Exit fullscreen mode

[<a aria-label="You must be signed in to star a repository" class="tooltipped tooltipped-s btn-sm btn BtnGroup-item color-bg-default" data-hydro-click='{"event_type":"authentication.click","payload":{"location_in_page":"star button","repository_id":576201,"auth_type":"LOG_IN","originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="b901d0051f5d392e6990ed43be8259b46c6bc69b73fa228065a5b24cb7acf2cf" data-view-component="true" href="/login?return_to=%2Fmrdoob%2Fthree.js" rel="nofollow"> <svg aria-hidden="true" class="octicon octicon-star v-align-text-bottom d-inline-block mr-2" data-view-component="true" height="16" version="1.1" viewbox="0 0 16 16" width="16">
<path d="M8 .25a.75.75 0 01.673.418l1.882 3.815 4.21.612a.75.75 0 01.416 1.279l-3.046 2.97.719 4.192a.75.75 0 01-1.088.791L8 12.347l-3.766 1.98a.75.75 0 01-1.088-.79l.72-4.194L.818 6.374a.75.75 0 01.416-1.28l4.21-.611L7.327.668A.75.75 0 018 .25zm0 2.445L6.615 5.5a.75.75 0 01-.564.41l-3.097.45 2.24 2.184a.75.75 0 01.216.664l-.528 3.084 2.769-1.456a.75.75 0 01.698 0l2.77 1.456-.53-3.084a.75.75 0 01.216-.664l2.24-2.183-3.096-.45a.75.75 0 01-.564-.41L8 2.694v.001z" fill-rule="evenodd"></path>.....

star_tags[0].text
Enter fullscreen mode Exit fullscreen mode

' \n\n\n Star\n 78.6k\n'

star_tags[0].text.strip('\n\n\n     Star\n')
Enter fullscreen mode Exit fullscreen mode

'78.6k'

def parse_star_count(stars_str):
    """Convert a GitHub star-count label such as ``'78.6k'`` into an integer.

    Args:
        stars_str: Star count as displayed on the page. Surrounding
            whitespace is ignored, thousands separators (``,``) are removed,
            and a trailing ``k`` or ``m`` suffix (either case) multiplies the
            value by 1,000 or 1,000,000.

    Returns:
        The star count as an ``int``.

    Raises:
        ValueError: If the string is empty or is not a number.
    """
    cleaned = stars_str.strip().replace(',', '')
    if not cleaned:
        # Fail with a clear message rather than an IndexError on cleaned[-1].
        raise ValueError('Empty star count: {!r}'.format(stars_str))
    multipliers = {'k': 1000, 'm': 1000000}
    suffix = cleaned[-1].lower()
    if suffix in multipliers:
        # round() instead of int(): a float product can land just below the
        # intended integer, and int() would truncate it.
        return round(float(cleaned[:-1]) * multipliers[suffix])
    return int(cleaned)
Enter fullscreen mode Exit fullscreen mode
parse_star_count(star_tags[0].text.strip('\n\n\n     Star\n'))
Enter fullscreen mode Exit fullscreen mode

78600

get_repo_info(repo_tags[0], star_tags[0])
Enter fullscreen mode Exit fullscreen mode

('mrdoob', 'three.js', 78600, 'https://github.com/mrdoob/three.js')

len(repo_tags)

def get_repo_info(h3_tag, star_tag):
    """Collect owner, repository name, star count and URL for one repository.

    The first link inside ``h3_tag`` is read as the owner and the second as
    the repository; ``star_tag`` supplies the star-count text.
    """
    links = h3_tag.find_all('a')
    owner = links[0].text.strip()
    repo_link = links[1]
    name = repo_link.text.strip()
    url = base_url + repo_link['href']
    star_label = star_tag.text.strip('\n\n\n     Star\n')
    return owner, name, parse_star_count(star_label), url
Enter fullscreen mode Exit fullscreen mode
# One list per output column; each repository appends one value to every list.
repo_columns = ('username', 'repo_name', 'stars', 'repo_url')
topic_repos_dict = {column: [] for column in repo_columns}

for i, h3_tag in enumerate(repo_tags):
    # star_tags is indexed by position: the i-th star button belongs to the
    # i-th repository heading.
    repo_info = get_repo_info(h3_tag, star_tags[i])
    for column, value in zip(repo_columns, repo_info):
        topic_repos_dict[column].append(value)
Enter fullscreen mode Exit fullscreen mode
topic_repos_df = pd.DataFrame(topic_repos_dict)
Enter fullscreen mode Exit fullscreen mode
topic_repos_df
Enter fullscreen mode Exit fullscreen mode


username repo_name stars repo_url
0 mrdoob three.js 78600 https://github.com/mrdoob/three.js
1 libgdx libgdx 19600 https://github.com/libgdx/libgdx
2 pmndrs react-three-fiber 16600 https://github.com/pmndrs/react-three-fiber
3 BabylonJS Babylon.js 15800 https://github.com/BabylonJS/Babylon.js
4 aframevr aframe 13700 https://github.com/aframevr/aframe
5 ssloy tinyrenderer 12000 https://github.com/ssloy/tinyrenderer
6 lettier 3d-game-shaders-for-beginners 12000 https://github.com/lettier/3d-game-shaders-for...
7 FreeCAD FreeCAD 10600 https://github.com/FreeCAD/FreeCAD
8 metafizzy zdog 9000 https://github.com/metafizzy/zdog
9 CesiumGS cesium 8200 https://github.com/CesiumGS/cesium.....

import os

def get_topic_page(topic_url):
    """Fetch ``topic_url`` and return its HTML parsed with BeautifulSoup.

    Raises:
        Exception: If the response status code is anything other than 200.
    """
    response = requests.get(topic_url)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    raise Exception('Failed to load page {}'.format(topic_url))

def get_repo_info(h3_tag, star_tag):
    """Return ``(username, repo_name, stars, repo_url)`` for one repository.

    Args:
        h3_tag: Heading tag whose first ``a`` link is the owner and whose
            second ``a`` link is the repository.
        star_tag: Tag whose text ends with the star count, for example
            ``Star 78.6k`` surrounded by whitespace.

    Returns:
        A tuple of the owner name, the repository name, the star count as an
        ``int`` and the absolute repository URL.
    """
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    repo_url = base_url + a_tags[1]['href']
    # The count is the last whitespace-separated token of the button text.
    # str.strip() with an argument removes a *set of characters*, not a
    # literal prefix, so it is not a reliable way to drop the "Star" label.
    stars = parse_star_count(star_tag.text.split()[-1])
    return username, repo_name, stars, repo_url

def get_topic_repos(topic_doc):
    """Build a DataFrame describing the repositories found in ``topic_doc``.

    ``topic_doc`` is searched for the ``h3`` headings that hold each
    repository's owner/name links and for the star-button links; the two
    result lists are paired by position.

    Returns:
        A DataFrame with the columns ``username``, ``repo_name``, ``stars``
        and ``repo_url``.
    """
    repo_tags = topic_doc.find_all(
        'h3', {'class': "f3 color-fg-muted text-normal lh-condensed"})
    star_tags = topic_doc.find_all(
        'a', {'class': 'tooltipped tooltipped-s btn-sm btn BtnGroup-item color-bg-default'})

    columns = ('username', 'repo_name', 'stars', 'repo_url')
    collected = {column: [] for column in columns}

    for position, h3_tag in enumerate(repo_tags):
        # Index star_tags explicitly so a missing star tag raises IndexError
        # instead of being skipped silently.
        info = get_repo_info(h3_tag, star_tags[position])
        for column, value in zip(columns, info):
            collected[column].append(value)

    return pd.DataFrame(collected)

def scrape_topic(topic_url, path):
    """Save the repositories of one topic page to a CSV file at ``path``.

    Nothing is downloaded when ``path`` already exists: a message is printed
    and the function returns.
    """
    already_saved = os.path.exists(path)
    if already_saved:
        print('The file {} already exists. skipping...'.format(path))
        return

    topic_doc = get_topic_page(topic_url)
    repos_df = get_topic_repos(topic_doc)
    repos_df.to_csv(path, index=None)
Enter fullscreen mode Exit fullscreen mode
topic_urls[6]
Enter fullscreen mode Exit fullscreen mode

'https://github.com/topics/ansible'

get_topic_repos(get_topic_page(topic_urls[6])).to_csv('ansible.csv', index=None)
Enter fullscreen mode Exit fullscreen mode
def get_topic_titles(doc):
    """Return the text of every topic-title ``p`` tag found in ``doc``."""
    title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    title_tags = doc.find_all('p', {'class': title_class})
    return [title_tag.text for title_tag in title_tags]

def get_topic_descs(doc):
    """Return the whitespace-stripped text of every topic-description ``p`` tag."""
    desc_class = 'f5 color-fg-muted mb-0 mt-1'
    desc_tags = doc.find_all('p', {'class': desc_class})
    return [desc_tag.text.strip() for desc_tag in desc_tags]

def get_topic_urls(doc):
    """Return absolute GitHub URLs for the ``a`` tags with class ``flex-grow-0``.

    Each tag's ``href`` is appended to ``https://github.com``.
    """
    github_root = "https://github.com"
    link_tags = doc.find_all('a', {'class': 'flex-grow-0'})
    return [github_root + link_tag['href'] for link_tag in link_tags]


def scrape_topics():
    """Download the GitHub topics page and tabulate the topics listed on it.

    Returns:
        A DataFrame with the columns ``title``, ``description`` and ``url``.

    Raises:
        Exception: If the page does not load with status code 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Report the local ``topics_url``; no ``topic_url`` exists in this scope.
        raise Exception('Failed to load page {}'.format(topics_url))
    # Parse the page that was just fetched so the result does not depend on
    # a module-level ``doc`` left over from earlier code.
    doc = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_descs(doc),
        'url': get_topic_urls(doc)
    }

    return pd.DataFrame(topics_dict)
Enter fullscreen mode Exit fullscreen mode
def scrape_topics_repos():
    """Scrape the topic list, then save each topic's repositories under ``data/``.

    One CSV file per topic is requested at ``data/<title>.csv``.
    """
    print('Scraping list of topics')
    topics_df = scrape_topics()

    # Output folder for the per-topic CSV files; fine if it already exists.
    os.makedirs("data", exist_ok=True)

    for _, topic in topics_df.iterrows():
        title = topic['title']
        print('Scraping top repositories for "{}"'.format(title))
        scrape_topic(topic['url'], 'data/{}.csv'.format(title))

Enter fullscreen mode Exit fullscreen mode
scrape_topics_repos()
Enter fullscreen mode Exit fullscreen mode

Scraping list of topics
Scraping top repositories for "3D"
The file data/3D.csv already exists. skipping...
Scraping top repositories for "Ajax"
The file data/Ajax.csv already exists. skipping...
Scraping top repositories for "Algorithm"
The file data/Algorithm.csv already exists. skipping...
Scraping top repositories for "Amp"
The file data/Amp.csv already exists. skipping...
Scraping top repositories for "Android".....

Top comments (0)