Today we are going to discuss GROUP BY with aggregation(SUM)
Suppose we have Tables of users , playlists and songs.
- Each user can have one or more playlists.
- Each playlist can have one or more songs
# models
class User(models.Model):
name = models.CharField(max_length=127)
@property
def playlists_total_length(self) -> int:
return sum([p.songs_total_length for p in self.playlists.all()])
class Playlist(models.Model):
user = models.ForeignKey(User, related_name='playlists')
name = models.CharField(max_length=255)
@property
def songs_total_length(self) -> int:
return sum([song.length for song in self.songs.all()])
class Song(models.Model):
playlist = models.ForeignKey(Playlist, related_name='songs')
title = models.CharField(max_length=255)
length = models.PositiveIntegerField(help_text=_('In seconds'))
So if our goal is to list all users with playlists_total_length
How bad is this code?
If we had for example:
- 10 Users
- 10 Playlists per user
- 10 songs per playlist
Total objects to fetch = 10 (Users) + 10*10 (Playlists) + 10*10*10 (Songs)
1110 DB rows in 3 different tables for just 10 users
Solution: GROUP BY with aggregation(SUM)
-
Use aggregation to calculate total length of all songs:
Song.objects.aggregate(Sum('length'))
SELECT SUM("song"."length") AS "length__sum" FROM "song";
Result: {'length__sum': 490}
-
Use annotate for Group by:
Song.objects.annotate(Sum('length'))
SELECT * , SUM("song"."length") AS "length__sum"
FROM "song"
GROUP BY "song"."id",
"song"."playlist_id",
"song"."title",
"song"."length
Result: <QuerySet [Song(...),Song(...),...]>
-
Use values before annotate to group by specific columns:
Song.objects.values('playlist_id').annotate(Sum('length'))
SELECT "song"."playlist_id", SUM("song"."length") AS "length__sum"
FROM "song"
GROUP BY "song"."playlist_id"
Result: <QuerySet [{'playlist_id': 2, 'length__sum': 310}, ...]>
- Use values_list to select sum values only not dict
Song.objects.values('playlist_id').annotate(Sum('length')).values_list('length__sum')
We can compress this query and still getting the same generated sql query:
Song.objects.values('playlist_id').values_list(Sum('length'))
SELECT SUM("song"."length") AS "length__sum"
FROM "song"
GROUP BY "song"."playlist_id"
Result: <QuerySet [(310,), (180,)]>
List all Playlists with total songs length:
songs_subquery = Subquery(
queryset=Song.objects
.values('playlist_id') \
.filter(playlist__id=OuterRef('id')) \
.values_list(Sum('length')),
output_field=IntegerField())
queryset = Playlist.objects.annotate(
_songs_total_length=songs_subquery)
OuterRef will make it possible to access fields from parent query when you are in subquery. OuterRef('id') -> "playlist"."id"
SELECT "playlist"."id",
"playlist"."user_id",
"playlist"."name",
(
SELECT SUM("song"."length")
FROM "song"
WHERE "song"."playlist_id" = ("playlist"."id")
GROUP BY "song"."playlist_id" LIMIT 1
) AS "songs_total_length"
FROM "playlist";
List all Users with total songs length:
songs_subquery = Subquery(
queryset=Song.objects
.values('playlist__id') \
.filter(playlist__id=OuterRef('id')) \
.values_list(Sum('length')),
output_field=IntegerField(default=0))
playlists_subquery = Subquery(
queryset=Playlist.objects
.values('user__id') \
.filter(user__id=OuterRef('id')) \
.annotate(_songs_total_length=Sum(songs_subquery)) \
.values_list('_songs_total_length'),
output_field=IntegerField(default=0))
queryset = User.objects.annotate(_playlists_total_length=playlists_subquery)
SELECT "user"."id",
"user"."name",
(SELECT SUM((SELECT SUM(U0."length") AS "length__sum"
FROM "song" U0
WHERE U0."playlist_id" = V0."id"
GROUP BY U0."playlist_id")) AS "_songs_total_length"
FROM "playlist" V0
WHERE V0."user_id" = "user"."id"
GROUP BY V0."user_id") AS "_playlists_total_length"
FROM "user";
It will be so hard to write this complex query in everywhere you want
So here is the final clean code:
class UserQuerySet(models.QuerySet):
@classmethod
def playlists_total_length(cls):
playlist_annotation = PlaylistQuerySet.songs_total_length()
queryset = Playlist.objects \
.values('user__id') \
.filter(user__id=OuterRef('id')) \
.values_list(Sum(playlist_annotation))
return Subquery(
queryset=queryset,
output_field=IntegerField()
)
def collect(self):
return self.annotate(_playlists_total_length=self.playlists_total_length())
class User(models.Model):
objects = UserQuerySet.as_manager()
name = models.CharField(max_length=255)
@property
def playlists_total_length(self):
if hasattr(self, '_playlists_total_length'):
return self._playlists_total_length
return sum([p.songs_total_length for p in self.playlists.all()])
class PlaylistQuerySet(models.QuerySet):
@classmethod
def songs_total_length(cls):
queryset = Song.objects \
.values('playlist__id') \
.filter(playlist__id=OuterRef('id')) \
.values_list(Sum('length')) # The SUM aggregation
return Subquery(
queryset=queryset,
output_field=models.IntegerField()
)
def collect(self):
return self.annotate(_songs_total_length=self.songs_total_length())
class Playlist(models.Model):
objects = PlaylistQuerySet.as_manager()
user = models.ForeignKey(User, related_name='playlists')
name = models.CharField(max_length=255)
@property
def songs_total_length(self):
if hasattr(self, '_songs_total_length'):
return self._songs_total_length
return sum([song.length for song in self.songs.all()])
class Song(models.Model):
playlist = models.ForeignKey(Playlist, related_name='songs')
title = models.CharField(max_length=255)
length = models.PositiveIntegerField(help_text='In seconds')
Thanks, Ivo Donchev
Top comments (1)
Just excellent! Optimized the download counter with fk linked files!