rails icon indicating copy to clipboard operation
rails copied to clipboard

Add support for managing collations in PostgreSQL

Open daniel-bryant opened this issue 1 year ago • 4 comments

Summary

Add support for managing collations in PostgreSQL.

https://www.postgresql.org/docs/current/collation.html

Add collations, create_collation, and drop_collation to PostgreSQLAdapter, and AbstractAdapter as empty methods. There is existing support for collations but only as a column modifier. A simple use case is adding a case-insensitive string field. In the past, we might have downcased on save, but then you'd need to remember to make your uniqueness constraint case-insensitive and to downcase on find, search, etc. We might have used citext, a module that must be installed. Current PostgreSQL documentation tells us to "Consider using nondeterministic collations".

class CreateUsers < ActiveRecord::Migration[7.1]
  def change
    create_collation "case_insensitive",
      provider: "icu",
      locale: "und-u-ks-level2",
      deterministic: false

    create_collation "ignore_accents",
      provider: "icu",
      locale: "und-u-ks-level1-kc-true",
      deterministic: false

    create_table :users do |t|
      t.string :email, collation: "case_insensitive"
      t.string :name, collation: "ignore_accents"
      t.timestamps
    end
    add_index :users, :email, unique: true
  end
end

ActiveRecord::Base.connection.collations
# ["case_insensitive", "ignore_accents"]

User.create!(email: "[email protected]", name: "dan")
# works

User.create!(email: "[email protected]", name: "dan")
# raises ActiveRecord::RecordNotUnique

class User < ApplicationRecord
  validates :email, uniqueness: true
end
# uniqueness validations just work

User.create!(email: "[email protected]", name: "dan")
# raises ActiveRecord::RecordInvalid

ActiveRecord::Base.connection.drop_collation "case_insensitive",
  provider: "icu", locale: "und-u-ks-level2", deterministic: false
# raises ActiveRecord::StatementInvalid because of the email column

Other Information

Prevents needing structure.sql just for collations. I wrote drop_collation to always be rollback safe. To keep things simple, I chose not to add 2 things:

  • a :from option on create_collation - I think this is less likely to be used
  • a :force option on drop_collation - dangerous and less likely to be used

daniel-bryant avatar Aug 05 '22 18:08 daniel-bryant

I'd like to know if this requirement can be achived by setting { case_sensitive: false } to the uniqueness validator as follows.

  • User model
class User < ApplicationRecord
  validates :email, uniqueness: { case_sensitive: false }
end
  • Migration
class CreateUsers < ActiveRecord::Migration[7.0]
  def change
    create_table :users do |t|
      t.string :email
      t.string :name
      t.timestamps
    end
    add_index :users, :email, unique: true
  end
end
  • Rails console to see the second user create will fail with Validation failed: Email has already been taken (ActiveRecord::RecordInvalid)
% bin/rails c
Loading development environment (Rails 7.0.3.1)
irb(main):001:0> User.create!(email: "[email protected]", name: "dan")
  TRANSACTION (0.1ms)  BEGIN
  User Exists? (0.5ms)  SELECT 1 AS one FROM "users" WHERE LOWER("users"."email") = LOWER($1) LIMIT $2  [["email", "[email protected]"], ["LIMIT", 1]]
  User Create (0.5ms)  INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id"  [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-07 04:03:51.117698"], ["updated_at", "2022-08-07 04:03:51.117698"]]
  TRANSACTION (0.6ms)  COMMIT
=>
#<User:0x00000001074506c0
 id: 1,
 email: "[email protected]",
 name: "dan",
 created_at: Sun, 07 Aug 2022 04:03:51.117698000 UTC +00:00,
 updated_at: Sun, 07 Aug 2022 04:03:51.117698000 UTC +00:00>
irb(main):002:0> User.create!(email: "[email protected]", name: "dan")
  TRANSACTION (0.7ms)  BEGIN
  User Exists? (0.7ms)  SELECT 1 AS one FROM "users" WHERE LOWER("users"."email") = LOWER($1) LIMIT $2  [["email", "[email protected]"], ["LIMIT", 1]]
  TRANSACTION (0.3ms)  ROLLBACK
/Users/yahonda/.rbenv/versions/3.1.2/lib/ruby/gems/3.1.0/gems/activerecord-7.0.3.1/lib/active_record/validations.rb:80:in `raise_validation_error': Validation failed: Email has already been taken (ActiveRecord::RecordInvalid)
irb(main):003:0>

yahonda avatar Aug 07 '22 04:08 yahonda

I'd like to know if this requirement can be achived by setting { case_sensitive: false } to the uniqueness validator as follows.

thanks for looking @yahonda that's how I usually start, but suppose later on we need to run some code before creation

class User < ApplicationRecord
  validates :email, uniqueness: { case_sensitive: false }
  before_create { sleep 3 } # simulate network call, bad but it happens
end

now we get

$ bin/rails c       
Loading development environment (Rails 7.1.0.alpha)
irb(main):001:1* def simulate
irb(main):002:1*   t1 = Thread.new { User.create!(email: "[email protected]", name: "dan") }
irb(main):003:1*   t2 = Thread.new { User.create!(email: "[email protected]", name: "dan") }
irb(main):004:1*   t1.join
irb(main):005:1*   t2.join
irb(main):006:0> end
=> :simulate
irb(main):007:0> simulate
  TRANSACTION (0.1ms)  BEGIN
  TRANSACTION (0.1ms)  BEGIN
  User Exists? (2.9ms)  SELECT 1 AS one FROM "users" WHERE LOWER("users"."email") = LOWER($1) LIMIT $2  [["email", "[email protected]"], ["LIMIT", 1]]
  User Exists? (3.2ms)  SELECT 1 AS one FROM "users" WHERE LOWER("users"."email") = LOWER($1) LIMIT $2  [["email", "[email protected]"], ["LIMIT", 1]]
  User Create (1.2ms)  INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id"  [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-07 19:33:46.334777"], ["updated_at", "2022-08-07 19:33:46.334777"]]
  User Create (0.9ms)  INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id"  [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-07 19:33:46.334914"], ["updated_at", "2022-08-07 19:33:46.334914"]]
  TRANSACTION (0.7ms)  COMMIT
  TRANSACTION (0.7ms)  COMMIT
=> #<Thread:0x000000010aaf7c50 (irb):3 dead>

Signup and emails are a naive example. Think of a system that sends millions of events, over some amount of time, from a source that can't guarantee event uniqueness or casing. Even with a 1 in 10,000 chance of saving a dup, you end up with duplicate records that cause issues and take up time.

There are things that you can do in the application like downcasing, locks, and mutexes but as a database maintainer, there's no way you can guarantee that logic gets written correctly and stays correct. At the DB level, there are options like an expression index with lower or using citext that are supported in rails. To me, collations are appealing because they attack the issue at the most basic level so everything that follows just works, they are more configurable than citext, and PG is already using them. But I can see them being a bit niche.

daniel-bryant avatar Aug 07 '22 23:08 daniel-bryant

At the DB level, there are options like an expression index with lower or using citext that are supported in rails.

I believe this is the way most applications deal with case-insensitive uniqueness.

But I can see them being a bit niche.

Honestly, I haven't ever used collations or seen someone used. Maybe I was doing something wrong? 🤷 Maybe it is better to implement this PR as a gem?

fatkodima avatar Aug 08 '22 14:08 fatkodima

Thanks for the explanation.

I understand there are some cases that database level validation is necessary. Still, I'm not inclined to implement create_collation related migrations because although this change is relatively large, CREATE COLLATION is only supported with PostgreSQL, not supported by SQLite and MySQL.

https://www.postgresql.org/docs/current/sql-createcollation.html

There is a CREATE COLLATION statement in the SQL standard, but it is limited to copying an existing collation. The syntax to create a new collation is a PostgreSQL extension.

Here are possible solutions: While both of them may not be perfect fit for your use case, Rails does not intend to implement all database specific features.

  1. Run 'create collationraw sql statements usingexecute` method

As this pull request example is making use of it, Rails already supports :collation option for columns. https://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/SchemaStatements.html#method-i-add_column

So would you execute create collation raw sql statements via execute method in each migration and specify the newly created collation name t.string <column name>, collation: <newly created collation>.

  1. Generated column

If you are running Rails 7.0 and PostgreSQL 12 or higher, generated columns may help. https://guides.rubyonrails.org/active_record_postgresql.html#generated-columns

  • Migrations to user
class CreateUsers < ActiveRecord::Migration[7.0]
  def change
    create_table :users do |t|
      t.string :email
      t.string :name
      t.virtual :lower_email, type: :string, as: 'lower(email)', stored: true
      t.timestamps
    end
    add_index :users, :email, unique: true
    add_index :users, :lower_email, unique: true
  end
end
  • User model validates :email, uniqueness: true is not necessary because the database unique index guarantees the uniqueness.
class User < ApplicationRecord
end
  • Adding another user whose email is uppercase gets PG::UniqueViolation: at the database level
% bin/rails c
Loading development environment (Rails 7.0.3.1)
irb(main):001:0> User.create!(email: "[email protected]", name: "dan")
  TRANSACTION (1.9ms)  BEGIN
  User Create (1.2ms)  INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id"  [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-09 11:27:16.678218"], ["updated_at", "2022-08-09 11:27:16.678218"]]
  TRANSACTION (0.4ms)  COMMIT
=>
#<User:0x000000010999f4a8
 id: 1,
 email: "[email protected]",
 name: "dan",
 lower_email: nil,
 created_at: Tue, 09 Aug 2022 11:27:16.678218000 UTC +00:00,
 updated_at: Tue, 09 Aug 2022 11:27:16.678218000 UTC +00:00>
irb(main):002:0> User.create!(email: "[email protected]", name: "dan")
  TRANSACTION (1.7ms)  BEGIN
  User Create (3.1ms)  INSERT INTO "users" ("email", "name", "created_at", "updated_at") VALUES ($1, $2, $3, $4) RETURNING "id"  [["email", "[email protected]"], ["name", "dan"], ["created_at", "2022-08-09 11:27:30.695152"], ["updated_at", "2022-08-09 11:27:30.695152"]]
  TRANSACTION (0.3ms)  ROLLBACK
/Users/yahonda/.rbenv/versions/3.1.2/lib/ruby/gems/3.1.0/gems/activerecord-7.0.3.1/lib/active_record/connection_adapters/postgresql_adapter.rb:768:in `exec_params': PG::UniqueViolation: ERROR:  duplicate key value violates unique constraint "index_users_on_lower_email" (ActiveRecord::RecordNotUnique)
DETAIL:  Key (lower_email)=([email protected]) already exists.
/Users/yahonda/.rbenv/versions/3.1.2/lib/ruby/gems/3.1.0/gems/activerecord-7.0.3.1/lib/active_record/connection_adapters/postgresql_adapter.rb:768:in `exec_params': ERROR:  duplicate key value violates unique constraint "index_users_on_lower_email" (PG::UniqueViolation)
DETAIL:  Key (lower_email)=([email protected]) already exists.
irb(main):003:0>

yahonda avatar Aug 09 '22 11:08 yahonda

I'm closing this pull request mainly because creating custom collations is not required for many users. If someone needs it, custom collations can be created via raw SQL statements as explained in https://github.com/rails/rails/pull/45776#issuecomment-1209284463

Thanks again for the pull request. I'm looking forward to your future contributions.

yahonda avatar Nov 22 '22 14:11 yahonda

@yahonda following your instructions on how to add a collation using execute and :collation option is not working with rake db:prepare or rake db:setup.

So would you execute create collation raw sql statements via execute method in each migration and specify the newly created collation name t.string <column name>, collation: <newly created collation>.

class AddSemverSort < ActiveRecord::Migration[7.0]
  def up
    execute "CREATE COLLATION semver (LOCALE = 'en-US-u-kn-true', PROVIDER = 'icu');"
    change_column :versions, :version, :string, collation: 'semver'
  end

  def down
    change_column :versions, :version, :string, collation: nil
    execute "DROP COLLATION semver;"
  end
end
ActiveRecord::Schema[7.0].define(version: 2023_01_16_193606) do
  # These are extensions that must be enabled in order to support this database
  enable_extension "pg_trgm"
  enable_extension "plpgsql"

  create_table "packages", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
    t.string "name", null: false
    t.text "readme"
    t.string "source_type", null: false
    t.string "package_type", null: false
    t.bigint "downloads", default: 0, null: false
    t.bigint "stargazers_count", default: 0, null: false
    t.jsonb "repository", default: {}, null: false
    t.jsonb "metadata", default: {}, null: false
    t.datetime "created_at", null: false
    t.datetime "updated_at", null: false
    t.index ["name"], name: "index_packages_on_name", unique: true
    t.index ["name"], name: "index_packages_on_name_fuzzy", opclass: :gist_trgm_ops, using: :gist
    t.index ["package_type"], name: "index_packages_on_package_type"
  end

  create_table "versions", force: :cascade do |t|
    t.string "status", null: false
    t.string "version", null: false, collation: "semver"
    t.string "license", null: false
    t.jsonb "engines", default: {}, null: false
    t.jsonb "metadata", default: {}, null: false
    t.uuid "package_id", null: false
    t.index ["package_id"], name: "index_versions_on_package_id"
    t.index ["status"], name: "index_versions_on_status"
    t.index ["version"], name: "index_versions_on_version"
  end

  add_foreign_key "versions", "packages"
end
ActiveRecord::StatementInvalid: PG::UndefinedObject: ERROR:  collation "semver" for encoding "UTF8" does not exist
LINE 1: ...ter varying NOT NULL, "version" character varying COLLATE "s...

jonian avatar Jan 21 '23 10:01 jonian