I am a MySQL guy. I can query MySQL, BigQuery or SQL-like databases for my day to day business. Last week, I faced a challenge where I needed to extract some information from a production MongoDB server that involves aggregation. I had no idea how MongoDB works or how to run CRUD operation on such database. So obviously, I spent my whole weekend in learning MongoDB. In this post I am going to share the MongoDB queries I learnt, and the equivalent MySQL queries as well.
I used MongoDB v3.6, for both daemon and cli, so some functions may not work with your current setup
Populate Data and Load into MongoDB
I used the following piece of code to scrap my Github profile information into a single JSON file.
import requests
import json
username = 'mdminhazulhaque'
repos = []
page = 1
while True:
params = (
('page', page),
('type', 'owner'),
('per_page', 100)
)
response = requests.get('https://api.github.com/users/{username}/repos'\
.format(username=username), params=params)
for single in response.json():
repos.append({
"name": single.get('name', None),
"description": single.get('description', None),
"created_at": single.get('created_at', None),
"language": single.get('language', None),
"stargazers_count": single.get('stargazers_count', None)
})
if len(response.json()) < 100:
break
else:
page += 1
with open("github.json", "w") as fp:
json.dump(repos, fp, indent=4)
Then I loaded the JSON data into MongoDB using mongoimport
.
mongoimport --db local --collection repos --drop --jsonArray --file github.json
One record from the repo
collection looks like the following.
$ mongo 127.0.0.1/local
> db.repos.findOne()
{
"_id" : ObjectId("5f1cfca24662cbc419b65e29"),
"name" : "a-pdf-reader",
"description" : "A poppler based pdf reader written in Qt4",
"created_at" : "2012-10-07T20:44:48Z",
"language" : "C++",
"stargazers_count": null
}
Let's start playing with MongoDB.
SELECT
select * from repos
> db.repos.find({})
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2a"), "name" : "apt-sync", "created_at" : "2017-11-19T04:57:23Z", "language" : "Python", "stargazers_count" : 0 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2b"), "name" : "aws-cli-cheatsheet", "created_at" : "2020-02-27T16:38:21Z", "language" : null, "stargazers_count" : 52 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2c"), "name" : "banglalion-wimax-mac", "created_at" : "2015-02-25T19:15:48Z", "language" : "HTML", "stargazers_count" : 0 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2d"), "name" : "annoying-bn-text", "created_at" : "2019-06-05T17:25:05Z", "language" : "HTML", "stargazers_count" : 1 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2e"), "name" : "banglalionwimaxapi", "created_at" : "2018-10-16T13:03:53Z", "language" : "Python", "stargazers_count" : 0 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2f"), "name" : "bcm-wimax-dkms", "created_at" : "2017-05-20T18:22:07Z", "language" : "C", "stargazers_count" : 0 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e30"), "name" : "bd-mrp-api", "created_at" : "2018-12-31T20:26:48Z", "language" : "Python", "stargazers_count" : 1 }
select name, description from repos
> db.repos.find({}, {_id:0, name:1, description:1})
{ "name" : "a-pdf-reader", "description" : "A poppler based pdf reader written in Qt4" }
{ "name" : "apt-sync", "description" : ":gift: Sync APT installed package list across different machines" }
{ "name" : "aws-cli-cheatsheet", "description" : ":cloud: AWS CLI + JQ = Make life easier" }
{ "name" : "banglalion-wimax-mac", "description" : ":satellite: Banglalion WiMAX on Mac OS X" }
{ "name" : "annoying-bn-text", "description" : "Piss of your friends with awkwardly written Bengali text" }
select name, description from repos limit 2
> db.repos.aggregate([{$project:{_id:0, name:1, description:1}}, {$limit:5}])
{ "name" : "a-pdf-reader", "description" : "A poppler based pdf reader written in Qt4" }
{ "name" : "apt-sync", "description" : ":gift: Sync APT installed package list across different machines" }
select name, description from repos order by name
> db.repos.find({}, {_id:0, name:1, description:1}).sort({name:1})
{ "name" : "Android-Javascript-Bridge", "description" : "Android and Javascript two way communication" }
{ "name" : "Android-JavascriptInterface-Sample", "description" : "Call Java (Android) methods from Webview using Javascript" }
{ "name" : "Bilai-PnP-Gui", "description" : ":cat2: GUI for Bilai PnP Modems" }
{ "name" : "CPPad", "description" : "Write and run C++ codes, instantly!" }
{ "name" : "CSE10Routine", "description" : "Tabbed routine app" }
select name, stargazers_count from repos order by stargazers_count desc
> db.repos.find({}, {_id:0, name:1, stargazers_count:1}).sort({stargazers_count:-1})
{ "name" : "aws-cli-cheatsheet", "stargazers_count" : 52 }
{ "name" : "probhat-osx", "stargazers_count" : 23 }
{ "name" : "PyQt-BPNN", "stargazers_count" : 22 }
{ "name" : "foreach", "stargazers_count" : 14 }
{ "name" : "html-table-to-json", "stargazers_count" : 12 }
select name, stargazers_count from repos where stargazers_count > 20 order by stargazers_count
> db.repos.find({ stargazers_count: { $gt: 20} }, {_id:0, name:1, stargazers_count:1}).sort({stargazers_count:-1})
{ "name" : "aws-cli-cheatsheet", "stargazers_count" : 52 }
{ "name" : "probhat-osx", "stargazers_count" : 23 }
{ "name" : "PyQt-BPNN", "stargazers_count" : 22 }
select name, stargazers_count from repos where language = "Python"
> db.repos.find({ language: "Python" }, {_id:0, name:1, stargazers_count:1})
{ "name" : "apt-sync", "stargazers_count" : 0 }
{ "name" : "banglalionwimaxapi", "stargazers_count" : 0 }
{ "name" : "bd-mrp-api", "stargazers_count" : 1 }
{ "name" : "flightaware-cli", "stargazers_count" : 0 }
{ "name" : "html-table-to-json", "stargazers_count" : 12 }
select name, description from repos where description like "%cli%"
> db.repos.find({ description: /cli/i }, {_id:0, name:1, description:1})
{ "name" : "aws-cli-cheatsheet", "description" : ":cloud: AWS CLI + JQ = Make life easier" }
{ "name" : "ClipMonitor", "description" : "A tiny clip monitor app built with Qt5" }
{ "name" : "foreach", "description" : ":repeat: Read lines from file/stdin and execute them as CLI argument" }
{ "name" : "kakitangan-cli", "description" : ":briefcase: CLI for Kakitangan app, the online HR Software for Malaysian businesses" }
{ "name" : "librs232", "description" : ":paperclip: RS232 Library in C" }
select language, count(*) as count, description from repos group by language order by count
> db.repos.aggregate([{ $group: { _id: "$language", count: { $sum: 1 } } },{ $sort: { count: -1 } }])
{ "_id" : "C++", "count" : 61 }
{ "_id" : "Python", "count" : 25 }
{ "_id" : "HTML", "count" : 12 }
{ "_id" : "Java", "count" : 12 }
{ "_id" : "Shell", "count" : 10 }
select language, sum(stargazers_count) as stars, description from repos group by language order by stars
> db.repos.aggregate([{ $group: { _id: "$language", stars: { $sum: "$stargazers_count" } } },{ $sort: { stars: -1 } }])
{ "_id" : "C++", "stars" : 72 }
{ "_id" : null, "stars" : 52 }
{ "_id" : "Python", "stars" : 47 }
{ "_id" : "Shell", "stars" : 43 }
{ "_id" : "HTML", "stars" : 19 }
select name, substr(created_at, 0, 4) as created_at_year from repos order by created_at_year desc
> db.repos.aggregate([{$addFields:{"created_at_year":{$substr:["$created_at", 0, 4]}}}, {$project: {_id:0, name:1, created_at_year:1}}, {$sort: {created_at_year:-1}}])
{ "name" : "aws-cli-cheatsheet", "created_at_year" : "2020" }
{ "name" : "gitfolio", "created_at_year" : "2020" }
{ "name" : "messenger-bot", "created_at_year" : "2020" }
{ "name" : "urctl", "created_at_year" : "2020" }
{ "name" : "annoying-bn-text", "created_at_year" : "2019" }
{ "name" : "esp32-mqtt-ir-remote", "created_at_year" : "2019" }
select substr(created_at, 0, 4)) as year, count(*) as count from repos group by year order by count desc
> db.repos.aggregate([{$addFields:{"created_at_year":{$substr:["$created_at", 0, 4]}}}, { $group: { _id: "$created_at_year", count: { $sum: 1 } } },{ $sort: { count: -1 } }])
{ "_id" : "2015", "count" : 41 }
{ "_id" : "2016", "count" : 35 }
{ "_id" : "2012", "count" : 18 }
{ "_id" : "2014", "count" : 14 }
{ "_id" : "2013", "count" : 12 }
{ "_id" : "2019", "count" : 10 }
select name, unix_timestamp(str_to_date(created_at, '%Y-%M-%dT%h:%m:%sz')) from repos
> db.repos.aggregate([{$addFields: {date: { $dateFromString: { dateString: "$created_at" }}}}, {$project: {_id:0, created_at_usec: { $subtract: [ "$date", new Date("1970-01-01") ] }, name:1}}])
{ "name" : "a-pdf-reader", "created_at_usec" : NumberLong("1349642688000") }
{ "name" : "apt-sync", "created_at_usec" : NumberLong("1511067443000") }
{ "name" : "aws-cli-cheatsheet", "created_at_usec" : NumberLong("1582821501000") }
{ "name" : "banglalion-wimax-mac", "created_at_usec" : NumberLong("1424891748000") }
{ "name" : "annoying-bn-text", "created_at_usec" : NumberLong("1559755505000") }
DELETE
delete from repose where language = "C++"
> db.repos.remove({language:"C++"})
ALTER
alter table repos drop column created_at
> db.repos.update({},{$unset:{created_at:""}})
alter table repos insert column foo int
> db.repos.update({}, {$set: {foo:0}}, {multi:1})
That's all for today. Feel free to share if any of the queries can be improved. Thanks for reading.
Top comments (2)
nice post. once you get past the initial learning curve of mongodb, it truly is a thing of beauty imho :-)
Thanks man.