rails
rails copied to clipboard
Add support for managing collations in PostgreSQL
Summary
Add support for managing collations in PostgreSQL.
https://www.postgresql.org/docs/current/collation.html
Add collations
, create_collation
, and drop_collation
to
PostgreSQLAdapter
, and AbstractAdapter
as empty methods. There is
existing support for collations but only as a column modifier. A simple use
case is adding a case-insensitive string field. In the past, we might have
downcased on save, but then you'd need to remember to make your uniqueness
constraint case-insensitive and to downcase on find, search, etc. We might
have used citext
, a module that must be installed. Current PostgreSQL
documentation tells us to "Consider using nondeterministic collations".
class CreateUsers < ActiveRecord::Migration[7.1]
def change
create_collation "case_insensitive",
provider: "icu",
locale: "und-u-ks-level2",
deterministic: false
create_collation "ignore_accents",
provider: "icu",
locale: "und-u-ks-level1-kc-true",
deterministic: false
create_table :users do |t|
t.string :email, collation: "case_insensitive"
t.string :name, collation: "ignore_accents"
t.timestamps
end
add_index :users, :email, unique: true
end
end
ActiveRecord::Base.connection.collations
# ["case_insensitive", "ignore_accents"]
User.create!(email: "[email protected]", name: "dan")
# works
User.create!(email: "[email protected]", name: "dan")
# raises ActiveRecord::RecordNotUnique
class User < ApplicationRecord
validates :email, uniqueness: true
end
# uniqueness validations just work
User.create!(email: "[email protected]", name: "dan")
# raises ActiveRecord::RecordInvalid
ActiveRecord::Base.connection.drop_collation "case_insensitive",
provider: "icu", locale: "und-u-ks-level2", deterministic: false
# raises ActiveRecord::StatementInvalid because of the email column
Other Information
Prevents needing structure.sql
just for collations.
I wrote drop_collation
to always be rollback safe.
To keep things simple, I chose not to add 2 things:
- a
:from
option oncreate_collation
- I think this is less likely to be used - a
:force
option ondrop_collation
- dangerous and less likely to be used
I'd like to know if this requirement can be achived by setting { case_sensitive: false }
to the uniqueness validator as follows.
- User model
class User < ApplicationRecord
validates :email, uniqueness: { case_sensitive: false }
end
- Migration
class CreateUsers < ActiveRecord::Migration[7.0]
def change
create_table :users do |t|
t.string :email
t.string :name
t.timestamps
end
add_index :users, :email, unique: true
end
end
- Rails console to see the second user create will fail with
Validation failed: Email has already been taken (ActiveRecord::RecordInvalid)
% bin/rails c
Loading development environment (Rails 7.0.3.1)
irb(main):001:0> User.create!(email: "[email protected]", name: "dan")
TRANSACTION (0.1ms) BEGIN
User Exists? (0.5ms) SELECT 1 AS one FROM "users" WHERE LOWER("users"."email") = LOWER($1) LIMIT $2 [["email", "[email protected]"], ["LIMIT", 1]]
User Create (0.5ms) INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id" [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-07 04:03:51.117698"], ["updated_at", "2022-08-07 04:03:51.117698"]]
TRANSACTION (0.6ms) COMMIT
=>
#<User:0x00000001074506c0
id: 1,
email: "[email protected]",
name: "dan",
created_at: Sun, 07 Aug 2022 04:03:51.117698000 UTC +00:00,
updated_at: Sun, 07 Aug 2022 04:03:51.117698000 UTC +00:00>
irb(main):002:0> User.create!(email: "[email protected]", name: "dan")
TRANSACTION (0.7ms) BEGIN
User Exists? (0.7ms) SELECT 1 AS one FROM "users" WHERE LOWER("users"."email") = LOWER($1) LIMIT $2 [["email", "[email protected]"], ["LIMIT", 1]]
TRANSACTION (0.3ms) ROLLBACK
/Users/yahonda/.rbenv/versions/3.1.2/lib/ruby/gems/3.1.0/gems/activerecord-7.0.3.1/lib/active_record/validations.rb:80:in `raise_validation_error': Validation failed: Email has already been taken (ActiveRecord::RecordInvalid)
irb(main):003:0>
I'd like to know if this requirement can be achived by setting
{ case_sensitive: false }
to the uniqueness validator as follows.
thanks for looking @yahonda that's how I usually start, but suppose later on we need to run some code before creation
class User < ApplicationRecord
validates :email, uniqueness: { case_sensitive: false }
before_create { sleep 3 } # simulate network call, bad but it happens
end
now we get
$ bin/rails c
Loading development environment (Rails 7.1.0.alpha)
irb(main):001:1* def simulate
irb(main):002:1* t1 = Thread.new { User.create!(email: "[email protected]", name: "dan") }
irb(main):003:1* t2 = Thread.new { User.create!(email: "[email protected]", name: "dan") }
irb(main):004:1* t1.join
irb(main):005:1* t2.join
irb(main):006:0> end
=> :simulate
irb(main):007:0> simulate
TRANSACTION (0.1ms) BEGIN
TRANSACTION (0.1ms) BEGIN
User Exists? (2.9ms) SELECT 1 AS one FROM "users" WHERE LOWER("users"."email") = LOWER($1) LIMIT $2 [["email", "[email protected]"], ["LIMIT", 1]]
User Exists? (3.2ms) SELECT 1 AS one FROM "users" WHERE LOWER("users"."email") = LOWER($1) LIMIT $2 [["email", "[email protected]"], ["LIMIT", 1]]
User Create (1.2ms) INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id" [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-07 19:33:46.334777"], ["updated_at", "2022-08-07 19:33:46.334777"]]
User Create (0.9ms) INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id" [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-07 19:33:46.334914"], ["updated_at", "2022-08-07 19:33:46.334914"]]
TRANSACTION (0.7ms) COMMIT
TRANSACTION (0.7ms) COMMIT
=> #<Thread:0x000000010aaf7c50 (irb):3 dead>
Signup and emails are a naive example. Think of a system that sends millions of events, over some amount of time, from a source that can't guarantee event uniqueness or casing. Even with a 1 in 10,000 chance of saving a dup, you end up with duplicate records that cause issues and take up time.
There are things that you can do in the application like downcasing, locks, and mutexes but as a database maintainer, there's no way you can guarantee that logic gets written correctly and stays correct. At the DB level, there are options like an expression index with lower
or using citext
that are supported in rails. To me, collations are appealing because they attack the issue at the most basic level so everything that follows just works, they are more configurable than citext
, and PG is already using them. But I can see them being a bit niche.
At the DB level, there are options like an expression index with lower or using citext that are supported in rails.
I believe this is the way most applications deal with case-insensitive uniqueness.
But I can see them being a bit niche.
Honestly, I haven't ever used collations or seen someone used. Maybe I was doing something wrong? 🤷 Maybe it is better to implement this PR as a gem?
Thanks for the explanation.
I understand there are some cases that database level validation is necessary.
Still, I'm not inclined to implement create_collation
related migrations because although this change is relatively large, CREATE COLLATION
is only supported with PostgreSQL, not supported by SQLite and MySQL.
https://www.postgresql.org/docs/current/sql-createcollation.html
There is a CREATE COLLATION statement in the SQL standard, but it is limited to copying an existing collation. The syntax to create a new collation is a PostgreSQL extension.
Here are possible solutions: While both of them may not be perfect fit for your use case, Rails does not intend to implement all database specific features.
- Run 'create collation
raw sql statements using
execute` method
As this pull request example is making use of it, Rails already supports :collation
option for columns.
https://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/SchemaStatements.html#method-i-add_column
So would you execute create collation
raw sql statements via execute
method in each migration and specify the newly created collation name t.string <column name>, collation: <newly created collation>
.
- Generated column
If you are running Rails 7.0 and PostgreSQL 12 or higher, generated columns may help. https://guides.rubyonrails.org/active_record_postgresql.html#generated-columns
- Migrations to user
class CreateUsers < ActiveRecord::Migration[7.0]
def change
create_table :users do |t|
t.string :email
t.string :name
t.virtual :lower_email, type: :string, as: 'lower(email)', stored: true
t.timestamps
end
add_index :users, :email, unique: true
add_index :users, :lower_email, unique: true
end
end
- User model
validates :email, uniqueness: true
is not necessary because the database unique index guarantees the uniqueness.
class User < ApplicationRecord
end
- Adding another user whose email is uppercase gets
PG::UniqueViolation:
at the database level
% bin/rails c
Loading development environment (Rails 7.0.3.1)
irb(main):001:0> User.create!(email: "[email protected]", name: "dan")
TRANSACTION (1.9ms) BEGIN
User Create (1.2ms) INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id" [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-09 11:27:16.678218"], ["updated_at", "2022-08-09 11:27:16.678218"]]
TRANSACTION (0.4ms) COMMIT
=>
#<User:0x000000010999f4a8
id: 1,
email: "[email protected]",
name: "dan",
lower_email: nil,
created_at: Tue, 09 Aug 2022 11:27:16.678218000 UTC +00:00,
updated_at: Tue, 09 Aug 2022 11:27:16.678218000 UTC +00:00>
irb(main):002:0> User.create!(email: "[email protected]", name: "dan")
TRANSACTION (1.7ms) BEGIN
User Create (3.1ms) INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id" [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-09 11:27:30.695152"], ["updated_at", "2022-08-09 11:27:30.695152"]]
TRANSACTION (0.3ms) ROLLBACK
/Users/yahonda/.rbenv/versions/3.1.2/lib/ruby/gems/3.1.0/gems/activerecord-7.0.3.1/lib/active_record/connection_adapters/postgresql_adapter.rb:768:in `exec_params': PG::UniqueViolation: ERROR: duplicate key value violates unique constraint "index_users_on_lower_email" (ActiveRecord::RecordNotUnique)
DETAIL: Key (lower_email)=([email protected]) already exists.
/Users/yahonda/.rbenv/versions/3.1.2/lib/ruby/gems/3.1.0/gems/activerecord-7.0.3.1/lib/active_record/connection_adapters/postgresql_adapter.rb:768:in `exec_params': ERROR: duplicate key value violates unique constraint "index_users_on_lower_email" (PG::UniqueViolation)
DETAIL: Key (lower_email)=([email protected]) already exists.
irb(main):003:0>
I'm closing this pull request mainly because creating custom collations is not required for many users. If someone needs it, custom collations can be created via raw SQL statements as explained in https://github.com/rails/rails/pull/45776#issuecomment-1209284463
Thanks again for the pull request. I'm looking forward to your future contributions.
@yahonda following your instructions on how to add a collation using execute
and :collation
option is not working with rake db:prepare
or rake db:setup
.
So would you execute
create collation
raw sql statements viaexecute
method in each migration and specify the newly created collation namet.string <column name>, collation: <newly created collation>
.
class AddSemverSort < ActiveRecord::Migration[7.0]
def up
execute "CREATE COLLATION semver (LOCALE = 'en-US-u-kn-true', PROVIDER = 'icu');"
change_column :versions, :version, :string, collation: 'semver'
end
def down
change_column :versions, :version, :string, collation: nil
execute "DROP COLLATION semver;"
end
end
ActiveRecord::Schema[7.0].define(version: 2023_01_16_193606) do
# These are extensions that must be enabled in order to support this database
enable_extension "pg_trgm"
enable_extension "plpgsql"
create_table "packages", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.string "name", null: false
t.text "readme"
t.string "source_type", null: false
t.string "package_type", null: false
t.bigint "downloads", default: 0, null: false
t.bigint "stargazers_count", default: 0, null: false
t.jsonb "repository", default: {}, null: false
t.jsonb "metadata", default: {}, null: false
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["name"], name: "index_packages_on_name", unique: true
t.index ["name"], name: "index_packages_on_name_fuzzy", opclass: :gist_trgm_ops, using: :gist
t.index ["package_type"], name: "index_packages_on_package_type"
end
create_table "versions", force: :cascade do |t|
t.string "status", null: false
t.string "version", null: false, collation: "semver"
t.string "license", null: false
t.jsonb "engines", default: {}, null: false
t.jsonb "metadata", default: {}, null: false
t.uuid "package_id", null: false
t.index ["package_id"], name: "index_versions_on_package_id"
t.index ["status"], name: "index_versions_on_status"
t.index ["version"], name: "index_versions_on_version"
end
add_foreign_key "versions", "packages"
end
ActiveRecord::StatementInvalid: PG::UndefinedObject: ERROR: collation "semver" for encoding "UTF8" does not exist
LINE 1: ...ter varying NOT NULL, "version" character varying COLLATE "s...